In [None]:
from itertools import combinations
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# Single seed for reproducibility. It is passed explicitly to every stochastic
# step in this notebook (KFold shuffling, train_test_split and LightGBM).
RANDOM_SEED = 42
# Also seed NumPy's global RNG for any code that draws from np.random directly.
np.random.seed(RANDOM_SEED)

## Data Preprocessing and Basic Feature Engineering

In [None]:
# Load data
# NOTE: DATA_DIR is machine-specific; point it at the folder holding train.csv / test.csv.
DATA_DIR = Path(r"C:\Users\dillo\Downloads")
train_df = pd.read_csv(DATA_DIR / "train.csv")
test_df = pd.read_csv(DATA_DIR / "test.csv")
test_ids = test_df['id'].copy()

# Configuration
target = 'loan_paid_back'
categories = [
    'gender', 'marital_status', 'education_level', 'employment_status',
    'loan_purpose', 'grade_subgrade', 'annual_income_bin', 'loan_amount_bin',
    'debt_to_income_ratio', 'credit_score', 'interest_rate'
]
SMOOTHING = 100
N_SPLITS = 5
N_BINS = 250


def _encode_features(fit_df, apply_df, cat_cols, col_pairs, target_col, prior, smoothing):
    """Encode the rows of ``apply_df`` using statistics computed on ``fit_df`` only.

    Parameters
    ----------
    fit_df : pd.DataFrame
        Rows used to compute the per-key statistics (must contain ``target_col``).
    apply_df : pd.DataFrame
        Rows to encode; only the columns in ``cat_cols`` are read.
    cat_cols : list of str
        Columns that receive a single-column target encoding.
    col_pairs : list of (str, str)
        Column pairs that receive a pairwise target encoding and a frequency.
    target_col : str
        Name of the target column in ``fit_df``.
    prior : float
        Value the smoothed mean shrinks towards; also used for keys absent from ``fit_df``.
    smoothing : float
        Weight of ``prior`` in the smoothed mean.

    Returns
    -------
    (single_te, pair_te, pair_freq) : tuple of np.ndarray (float32)
        One row per ``apply_df`` row. ``pair_freq`` holds the pair count divided by
        ``len(fit_df)``; pairs absent from ``fit_df`` get 0.
    """
    n_rows = len(apply_df)
    n_fit = len(fit_df)

    single_te = np.zeros((n_rows, len(cat_cols)), dtype=np.float32)
    pair_te = np.zeros((n_rows, len(col_pairs)), dtype=np.float32)
    pair_freq = np.zeros((n_rows, len(col_pairs)), dtype=np.float32)

    # Single-Column Target Encoding with Smoothing
    for i, col in enumerate(cat_cols):
        agg = fit_df.groupby(col, observed=False)[target_col].agg(['sum', 'count'])
        smoothed_means = (agg['sum'] + prior * smoothing) / (agg['count'] + smoothing)
        single_te[:, i] = apply_df[col].map(smoothed_means).fillna(prior).values

    # Pairwise Target & Frequency Encoding
    for j, (col1, col2) in enumerate(col_pairs):
        grouped = fit_df.groupby([col1, col2], observed=False)

        # Target Encoding with Smoothing
        agg = grouped[target_col].agg(['sum', 'count'])
        smoothed_means = (agg['sum'] + prior * smoothing) / (agg['count'] + smoothing)

        # Look up each row's (col1, col2) key through a MultiIndex
        apply_index = pd.MultiIndex.from_arrays([apply_df[col1].values, apply_df[col2].values])
        pair_te[:, j] = smoothed_means.reindex(apply_index, fill_value=prior).values

        # Frequency Encoding as a share of the fitting rows. Raw counts would put
        # the out-of-fold train features (fitted on N_SPLITS-1 folds) and the test
        # features (fitted on all training rows) on different scales.
        rel_freq = grouped.size() / n_fit
        pair_freq[:, j] = rel_freq.reindex(apply_index, fill_value=0).values

    return single_te, pair_te, pair_freq


# Quantile Binning on Training Data
train_df['annual_income_bin'], income_bins = pd.qcut(
    train_df['annual_income'], N_BINS, labels=False, duplicates='drop', retbins=True
)
train_df['loan_amount_bin'], loan_bins = pd.qcut(
    train_df['loan_amount'], N_BINS, labels=False, duplicates='drop', retbins=True
)

# Apply Bins to Test Data
test_df['annual_income_bin'] = pd.cut(
    test_df['annual_income'], bins=income_bins, labels=False, include_lowest=True
)
test_df['loan_amount_bin'] = pd.cut(
    test_df['loan_amount'], bins=loan_bins, labels=False, include_lowest=True
)

# Handle Out-Of-Range Values: test values outside the training bin edges come
# back as NaN from pd.cut and are given the sentinel bin -1.
test_df['annual_income_bin'] = test_df['annual_income_bin'].fillna(-1).astype(int)
test_df['loan_amount_bin'] = test_df['loan_amount_bin'].fillna(-1).astype(int)

# K-Fold Setup
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
global_mean = train_df[target].mean()
pair_cols = list(combinations(categories, 2))

single_te_cols = [f'{col}_te' for col in categories]
pair_te_cols = [f'{col1}_{col2}_te' for col1, col2 in pair_cols]
pair_freq_cols = [f'{col1}_{col2}_freq' for col1, col2 in pair_cols]

# Pre-allocate Empty Numpy Arrays for the Out-Of-Fold Train Features
n_train = len(train_df)
n_test = len(test_df)

train_single_te = np.zeros((n_train, len(categories)), dtype=np.float32)
train_pair_te = np.zeros((n_train, len(pair_cols)), dtype=np.float32)
train_pair_freq = np.zeros((n_train, len(pair_cols)), dtype=np.float32)

# K-Fold Feature Engineering for Train Data: every row is encoded with
# statistics from the folds it does not belong to.
for train_idx, val_idx in kf.split(train_df):
    fold_single, fold_pair_te, fold_pair_freq = _encode_features(
        train_df.iloc[train_idx], train_df.iloc[val_idx],
        categories, pair_cols, target, global_mean, SMOOTHING,
    )
    train_single_te[val_idx] = fold_single
    train_pair_te[val_idx] = fold_pair_te
    train_pair_freq[val_idx] = fold_pair_freq

# Feature Engineering for Test Data (using full training data)
test_single_te, test_pair_te, test_pair_freq = _encode_features(
    train_df, test_df, categories, pair_cols, target, global_mean, SMOOTHING,
)

# Convert arrays to DataFrames
train_single_te_df = pd.DataFrame(train_single_te, columns=single_te_cols, index=train_df.index)
train_pair_te_df = pd.DataFrame(train_pair_te, columns=pair_te_cols, index=train_df.index)
train_pair_freq_df = pd.DataFrame(train_pair_freq, columns=pair_freq_cols, index=train_df.index)

test_single_te_df = pd.DataFrame(test_single_te, columns=single_te_cols, index=test_df.index)
test_pair_te_df = pd.DataFrame(test_pair_te, columns=pair_te_cols, index=test_df.index)
test_pair_freq_df = pd.DataFrame(test_pair_freq, columns=pair_freq_cols, index=test_df.index)

# Concatenate all features at once
train_df = pd.concat([train_df, train_single_te_df, train_pair_te_df, train_pair_freq_df], axis=1)
test_df = pd.concat([test_df, test_single_te_df, test_pair_te_df, test_pair_freq_df], axis=1)

# The row id is not a feature (test ids were saved in test_ids above)
train_df = train_df.drop(columns=['id'])
test_df = test_df.drop(columns=['id'])

# Convert Categorical Columns to 'category' dtype
for col in categories:
    train_df[col] = train_df[col].astype('category')
    test_df[col] = test_df[col].astype('category')

# Everything that is neither categorical nor the target is treated as numeric
non_numeric = set(categories) | {target}
numeric = [col for col in train_df.columns if col not in non_numeric]

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"Number of features created: {len(single_te_cols) + len(pair_te_cols) + len(pair_freq_cols)}")

Training data shape: (593994, 135)
Test data shape: (254569, 134)
Number of features created: 121


## **Function to Get Feature Importance**

In [None]:
def get_feature_importances(train_df, target, numerical, categorical, random_state=RANDOM_SEED, verbose=True):
    """
    Train a quick LightGBM model and return feature importances.

    Parameters
    ----------
    train_df : pd.DataFrame
        Frame containing every column named in ``numerical`` and ``categorical``
        plus the ``target`` column. It is not modified.
    target : str
        Name of the target column.
    numerical : list of str
        Columns passed to the model as-is.
    categorical : list of str
        Columns cast to pandas ``category`` dtype and declared to LightGBM as
        categorical features.
    random_state : int
        Seed forwarded to the LightGBM estimator.
    verbose : bool, default True
        When True, print one ``feature: importance`` line per feature.

    Returns
    -------
    pd.Series
        ``model.feature_importances_`` (LightGBM's default importance type),
        indexed by feature name and sorted in descending order.
    """
    categorical = list(categorical)
    features = list(numerical) + categorical

    # Copy only the model inputs (not the whole frame) before casting, so the
    # caller's DataFrame is untouched and unused columns are never duplicated.
    X = train_df[features].astype({col: 'category' for col in categorical})
    y = train_df[target]

    # Train LightGBM
    # NOTE(review): a regressor is fitted whatever the target type; the final model
    # in this notebook uses a binary objective - confirm this proxy is intended.
    model = lgb.LGBMRegressor(n_estimators=1000, random_state=random_state, verbose=-1)
    model.fit(X, y, categorical_feature=categorical)

    # Get Feature Importance
    importances = pd.Series(model.feature_importances_, index=features).sort_values(ascending=False)

    if verbose:
        print("Feature importances:")
        for feat, imp in importances.items():
            print(f"{feat}: {imp}")

    return importances

In [None]:
# Get Feature Importance
importances = get_feature_importances(train_df, target, numeric, categories)

# Remove columns whose importance is at or below the threshold.
# (The list keeps its historical name `zero_feats`, although it holds every
# low-importance feature, not only those with importance 0.)
IMPORTANCE_THRESHOLD = 15
zero_feats = list(importances[importances <= IMPORTANCE_THRESHOLD].index)
print("Drop Columns:")
print(zero_feats)

# Keep the feature lists in sync with the columns that remain in train_df
dropped = set(zero_feats)
numeric = [col for col in numeric if col not in dropped]
categories = [col for col in categories if col not in dropped]
print("Numeric Columns:")
print(numeric)
print("Categorical Columns:")
print(categories)

train_df = train_df.drop(columns=zero_feats)

Drop Columns:
['gender_grade_subgrade_freq', 'credit_score_interest_rate_freq', 'loan_amount_bin_te', 'gender_marital_status_freq', 'annual_income_bin_loan_amount_bin_freq', 'debt_to_income_ratio_interest_rate_freq', 'gender_loan_purpose_freq', 'grade_subgrade_loan_amount_bin_freq', 'loan_purpose_te', 'marital_status_te', 'annual_income_bin_interest_rate_freq', 'education_level_grade_subgrade_freq', 'gender_education_level_te', 'marital_status_education_level_freq', 'loan_amount_bin_interest_rate_freq', 'loan_amount_bin_credit_score_freq', 'loan_purpose', 'education_level_te', 'gender_te', 'education_level', 'gender', 'marital_status']
Numeric Columns:
['annual_income', 'loan_amount', 'employment_status_te', 'annual_income_bin_te', 'debt_to_income_ratio_te', 'credit_score_te', 'interest_rate_te', 'gender_marital_status_te', 'gender_employment_status_te', 'gender_grade_subgrade_te', 'gender_annual_income_bin_te', 'gender_debt_to_income_ratio_te', 'gender_credit_score_te', 'gender_intere

In [None]:
# Hold-out AUC of LightGBM on the pruned feature set
y = train_df[target]
X = train_df.drop(columns=[target])

# Stratified 70/30 split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=RANDOM_SEED
)

params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 31,
    'random_state': RANDOM_SEED
}

# Build the LightGBM datasets with the categorical features declared
print("\nTraining LightGBM...")
lgb_train = lgb.Dataset(X_train, label=y_train, categorical_feature=categories)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train, categorical_feature=categories)

# NOTE: the hold-out set drives early stopping and is also the set scored below,
# so the reported AUC is selected on the same rows it is measured on.
model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_eval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100),
    ],
)

# Score the hold-out rows at the best iteration
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
auc = roc_auc_score(y_test, y_pred)

print(f"\nTest AUC: {auc:.4f}")
print(f"Best iteration: {model.best_iteration}")
print(f"Total features used: {len(X.columns)}")


Training LightGBM...
Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.918603
[200]	valid_0's auc: 0.920814
[300]	valid_0's auc: 0.922464
[400]	valid_0's auc: 0.923224
[500]	valid_0's auc: 0.923465
[600]	valid_0's auc: 0.923537
Early stopping, best iteration is:
[617]	valid_0's auc: 0.923545

Test AUC: 0.9235
Best iteration: 617
Total features used: 112


In [None]:
# Hard-coded copy of the "Drop Columns" list printed by the feature-importance
# cell above (same names, same order).
drop_cols = [
    'gender_grade_subgrade_freq',
    'credit_score_interest_rate_freq',
    'loan_amount_bin_te',
    'gender_marital_status_freq',
    'annual_income_bin_loan_amount_bin_freq',
    'debt_to_income_ratio_interest_rate_freq',
    'gender_loan_purpose_freq',
    'grade_subgrade_loan_amount_bin_freq',
    'loan_purpose_te',
    'marital_status_te',
    'annual_income_bin_interest_rate_freq',
    'education_level_grade_subgrade_freq',
    'gender_education_level_te',
    'marital_status_education_level_freq',
    'loan_amount_bin_interest_rate_freq',
    'loan_amount_bin_credit_score_freq',
    'loan_purpose',
    'education_level_te',
    'gender_te',
    'education_level',
    'gender',
    'marital_status',
]