# **FOREWORD**

Thanks to the baseline work [here](https://www.kaggle.com/code/masayakawamata/s5e11-te-xgb-interaction-features). I add a couple of ML models to the original work and complete my baseline work

This is my starter work for the November Playground 2025 challenge using the train and original as rows and interaction features with mean and count encoders

In [1]:
import warnings, torch
import pandas as pd, numpy as np
warnings.simplefilter('ignore')
from itertools import combinations
from tqdm.notebook import tqdm

from sklearn.model_selection import StratifiedKFold, KFold
from xgboost import XGBClassifier as XGBC
from lightgbm import LGBMClassifier as LGBMC, log_evaluation, early_stopping
from catboost import CatBoostClassifier as CBC
from sklearn.metrics import *

from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
test_req = False

if test_req :
    print("THIS IS A SYNTAX CHECK RUN")
    nest = 200
else:
    nest = 7000

# **PREPROCESSING**

In [3]:

train = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')
orig = pd.read_csv('/kaggle/input/loan-prediction-dataset-2025/loan_dataset_20000.csv')

print('Train Shape:', train.shape)
print('Test Shape:', test.shape)
print('Orig Shape:', orig.shape)

TARGET = 'loan_paid_back'
CATS   = ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose', 'grade_subgrade']
BASE   = [col for col in train.columns if col not in ['id', TARGET]]

Train Shape: (593994, 13)
Test Shape: (254569, 12)
Orig Shape: (20000, 22)


# **FEATURE ENGINEERING**

In [4]:

INTER = []

for col1, col2 in tqdm(combinations(BASE, 2)):
    new_col_name = f'{col1}_{col2}'
    INTER.append(new_col_name)
    for df in [train, test, orig] :
        df[new_col_name] = df[col1].astype(str) + '_' + df[col2].astype(str)
        
print(f'{len(INTER)} Features')

for col1, col2, col3 in combinations(CATS, 3 ):
    new_col_name = f'{col1}_{col2}_{col3}'
    INTER.append(new_col_name)
    for df in [train, test, orig]:
        df[new_col_name] = df[col1].astype(str) + '_' + df[col2].astype(str) + df[col3].astype(str)
        
print(f'{len(INTER)} Features')

ORIG = []

for col in BASE:
    # MEAN
    mean_map = orig.groupby(col)[TARGET].mean()
    new_mean_col_name = f"orig_mean_{col}"
    mean_map.name = new_mean_col_name
    
    train = train.merge(mean_map, on=col, how='left')
    test = test.merge(mean_map, on=col, how='left')
    ORIG.append(new_mean_col_name)

    # COUNT
    new_count_col_name = f"orig_count_{col}"
    count_map = orig.groupby(col).size().reset_index(name=new_count_col_name)
    
    train = train.merge(count_map, on=col, how='left')
    test  = test.merge(count_map, on=col, how='left')
    ORIG.append(new_count_col_name)

print(len(ORIG), 'Orig Features Created!!')

FEATURES = BASE + ORIG + INTER
print(len(FEATURES), 'Features.')

X = train[FEATURES]
y = train[TARGET]

0it [00:00, ?it/s]

55 Features
75 Features
22 Orig Features Created!!
108 Features.


# **MODEL TRAINING**

In [5]:

N_SPLITS = 5
skf      = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)


Mdl_Master = \
{     
 f'XGB1C'  : [
              XGBC(**{ "objective"            : "binary:logistic",
                       "eval_metric"          : "auc",
                       'device'               : "cuda:0" if torch.cuda.is_available() else "cpu",
                       'learning_rate'        : 0.01,
                       'n_estimators'         : nest,
                       'max_depth'            : 8,
                       'subsample'            : 0.90,
                       'colsample_bytree'     : 0.75,
                       'reg_lambda'           : 0.75,
                       'reg_alpha'            : 0.001,
                       'verbosity'            : 0,
                       'random_state'         : 42,
                       'enable_categorical'   : True,
                       'early_stopping_rounds' : 100,
                      } 
                   ),
              {"verbose" : 0}
             ],

 f'LGBM1C'  : [
            LGBMC(**{  "objective"            : "binary",
                       "eval_metric"          : "auc",
                       'device'               : "gpu" if torch.cuda.is_available() else "cpu",
                       'learning_rate'        : 0.01,
                       'n_estimators'         : nest ,
                       'max_depth'            : 7,
                       'subsample'            : 0.90,
                       'colsample_bytree'     : 0.60,
                       'reg_lambda'           : 1.25,
                       'reg_alpha'            : 0.001,
                       'verbosity'            : -1,
                       'random_state'         : 42,
                      } 
                   ),
              {
                  "callbacks" : [
                      log_evaluation(0), 
                      early_stopping(100, verbose = False)
                  ]
              }
             ],

 f'LGBM2C'  : [
            LGBMC(**{  "objective"            : "binary",
                       "data_sample_strategy" : "goss",
                       "eval_metric"          : "auc",
                       'device'               : "gpu" if torch.cuda.is_available() else "cpu",
                       'learning_rate'        : 0.01,
                       'n_estimators'         : nest ,
                       'max_depth'            : 6,
                       'subsample'            : 0.825,
                       'colsample_bytree'     : 0.55,
                       'reg_lambda'           : 0.85,
                       'reg_alpha'            : 0.001,
                       'verbosity'            : -1,
                       'random_state'         : 42,
                      } 
                   ),
              {
                  "callbacks" : [
                      log_evaluation(0), 
                      early_stopping(100, verbose = False)
                  ]
              }
             ],
}




In [6]:
class TargetEncoder(BaseEstimator, TransformerMixin):
    """
    Target Encoder that supports multiple aggregation functions,
    internal cross-validation for leakage prevention, and smoothing.

    Parameters
    ----------
    cols_to_encode : list of str
        List of column names to be target encoded.

    aggs : list of str, default=['mean']
        List of aggregation functions to apply. Any function accepted by
        pandas' `.agg()` method is supported, such as:
        'mean', 'std', 'var', 'min', 'max', 'skew', 'nunique', 
        'count', 'sum', 'median'.
        Smoothing is applied only to the 'mean' aggregation.

    cv : int, default=5
        Number of folds for cross-validation in fit_transform.

    smooth : float or 'auto', default='auto'
        The smoothing parameter `m`. A larger value puts more weight on the 
        global mean. If 'auto', an empirical Bayes estimate is used.
        
    drop_original : bool, default=False
        If True, the original columns to be encoded are dropped.
    """
    def __init__(self, cols_to_encode, aggs=['mean'], cv=5, smooth='auto', drop_original=False):
        self.cols_to_encode = cols_to_encode
        self.aggs = aggs
        self.cv = cv
        self.smooth = smooth
        self.drop_original = drop_original
        self.mappings_ = {}
        self.global_stats_ = {}

    def fit(self, X, y):
        """
        Learn mappings from the entire dataset.
        These mappings are used for the transform method on validation/test data.
        """
        temp_df = X.copy()
        temp_df['target'] = y

        # Learn global statistics for each aggregation
        for agg_func in self.aggs:
            self.global_stats_[agg_func] = y.agg(agg_func)

        # Learn category-specific mappings
        for col in self.cols_to_encode:
            self.mappings_[col] = {}
            for agg_func in self.aggs:
                mapping = temp_df.groupby(col)['target'].agg(agg_func)
                self.mappings_[col][agg_func] = mapping
        
        return self

    def transform(self, X):
        """
        Apply learned mappings to the data.
        Unseen categories are filled with global statistics.
        """
        X_transformed = X.copy()
        for col in self.cols_to_encode:
            for agg_func in self.aggs:
                new_col_name = f'TE_{col}_{agg_func}'
                map_series = self.mappings_[col][agg_func]
                X_transformed[new_col_name] = X[col].map(map_series)
                X_transformed[new_col_name].fillna(self.global_stats_[agg_func], inplace=True)
        
        if self.drop_original:
            X_transformed.drop(columns=self.cols_to_encode, inplace=True)
            
        return X_transformed

    def fit_transform(self, X, y):
        """
        Fit and transform the data using internal cross-validation to prevent leakage.
        """
        # First, fit on the entire dataset to get global mappings for transform method
        self.fit(X, y)

        # Initialize an empty DataFrame to store encoded features
        encoded_features = pd.DataFrame(index=X.index)
        
        kf = KFold(n_splits=self.cv, shuffle=True, random_state=42)

        for train_idx, val_idx in kf.split(X, y):
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_val = X.iloc[val_idx]
            
            temp_df_train = X_train.copy()
            temp_df_train['target'] = y_train

            for col in self.cols_to_encode:
                # --- Calculate mappings only on the training part of the fold ---
                for agg_func in self.aggs:
                    new_col_name = f'TE_{col}_{agg_func}'
                    
                    # Calculate global stat for this fold
                    fold_global_stat = y_train.agg(agg_func)
                    
                    # Calculate category stats for this fold
                    mapping = temp_df_train.groupby(col)['target'].agg(agg_func)

                    # --- Apply smoothing only for 'mean' aggregation ---
                    if agg_func == 'mean':
                        counts = temp_df_train.groupby(col)['target'].count()
                        
                        m = self.smooth
                        if self.smooth == 'auto':
                            # Empirical Bayes smoothing
                            variance_between = mapping.var()
                            avg_variance_within = temp_df_train.groupby(col)['target'].var().mean()
                            if variance_between > 0:
                                m = avg_variance_within / variance_between
                            else:
                                m = 0  # No smoothing if no variance between groups
                        
                        # Apply smoothing formula
                        smoothed_mapping = (counts * mapping + m * fold_global_stat) / (counts + m)
                        encoded_values = X_val[col].map(smoothed_mapping)
                    else:
                        encoded_values = X_val[col].map(mapping)
                    
                    # Store encoded values for the validation fold
                    encoded_features.loc[X_val.index, new_col_name] = encoded_values.fillna(fold_global_stat)

        # Merge with original DataFrame
        X_transformed = X.copy()
        for col in encoded_features.columns:
            X_transformed[col] = encoded_features[col]
            
        if self.drop_original:
            X_transformed.drop(columns=self.cols_to_encode, inplace=True)
            
        return X_transformed

In [7]:

OOF_Preds    = []
Mdl_Preds    = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    
    print(f'--- Fold {fold}/{N_SPLITS} ---')
    
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    X_test         = test[FEATURES].copy()

    TE      = TargetEncoder(cols_to_encode=INTER, cv=5, smooth='auto', aggs=['mean'], drop_original=True)
    X_train = TE.fit_transform(X_train, y_train)
    X_val   = TE.transform(X_val)
    X_test  = TE.transform(X_test)

    X_train[CATS] = X_train[CATS].astype('category')
    X_val[CATS]   = X_val[CATS].astype('category')
    X_test[CATS]  = X_test[CATS].astype('category')

    oof_preds , test_preds = [], []
    
    for method, (model, fit_params) in Mdl_Master.items():
  
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], **fit_params)
        val_preds = pd.Series(model.predict_proba(X_val)[:, 1], name = method, index = val_idx)
        oof_preds.append(val_preds)
        test_preds.append(pd.Series( model.predict_proba(X_test)[:, 1], name = method))

        print(f"---> Model {method} fitted successfully")

    oof_preds  = pd.concat(oof_preds, axis=1)
    test_preds = pd.concat(test_preds, axis=1) 

    OOF_Preds.append(oof_preds)
    Mdl_Preds.append(test_preds)

OOF_Preds = pd.concat(OOF_Preds, axis=0, ignore_index = False).sort_index(ascending = True)
Mdl_Preds = pd.concat(Mdl_Preds, axis=0, ignore_index = False).groupby(level = 0).mean()

--- Fold 1/5 ---
---> Model XGB1C fitted successfully




---> Model LGBM1C fitted successfully
---> Model LGBM2C fitted successfully
--- Fold 2/5 ---
---> Model XGB1C fitted successfully
---> Model LGBM1C fitted successfully
---> Model LGBM2C fitted successfully
--- Fold 3/5 ---
---> Model XGB1C fitted successfully
---> Model LGBM1C fitted successfully
---> Model LGBM2C fitted successfully
--- Fold 4/5 ---
---> Model XGB1C fitted successfully
---> Model LGBM1C fitted successfully
---> Model LGBM2C fitted successfully
--- Fold 5/5 ---
---> Model XGB1C fitted successfully
---> Model LGBM1C fitted successfully
---> Model LGBM2C fitted successfully


# **ENSEMBLE**

Let's do a simple average for now and then submit. Ensembling is a key step for Playground competitions and this step needs to be improved

In [8]:
oof_preds = OOF_Preds.mean(axis = 1).values
score     = roc_auc_score(y, oof_preds)
print(f"---> Score = {score:,.8f}\n")

test_preds = Mdl_Preds.mean(axis = 1).values

sub_fl = pd.read_csv(
    f"/kaggle/input/playground-series-s5e11/sample_submission.csv",
    index_col = "id",
)
sub_fl["loan_paid_back"] = test_preds

sub_fl.to_csv(f"submission.csv", index = True)

print()
!head submission.csv

---> Score = 0.92580597


id,loan_paid_back
593994,0.939704894217774
593995,0.9653727358269988
593996,0.4610757428109869
593997,0.9135061741269461
593998,0.9605767669882477
593999,0.9689544657250463
594000,0.979286432938531
594001,0.9709778696495937
594002,0.9547152776166739
