<a href="https://colab.research.google.com/github/coded-sly/s5e11-Loan-Payback/blob/main/ps5e11_nn_fe_cv6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import subprocess

# Check what’s installed
result = subprocess.run(['pip', 'list'], capture_output=True, text=True)
installed = set()
for line in result.stdout.splitlines():
    # Skip header lines if present
    if line.strip() and not line.lower().startswith('package'):
        # Package name usually is the first column
        name = line.split()[0]
        installed.add(name)

# Target packages
targets = ['skrub', 'tabicl', 'tabpfn', 'tabpfn_extensions', 'pytabkit']

# Collect missing ones
installations = [pkg for pkg in targets if pkg not in installed]

if installations:
    print(f"Installing missing packages: {installations}")
    subprocess.run(['pip', 'install', '-q'] + installations)
else:
    print("All target packages are already installed.")

Installing missing packages: ['skrub', 'tabicl', 'tabpfn', 'tabpfn_extensions', 'pytabkit']


In [2]:
## -- Device-Agnostic for GPU --
import torch
print(f"ℹ️ Cuda available: {torch.cuda.is_available()}")

# import cuml.accel
# cuml.accel.install()

%load_ext cudf.pandas
from cuml.preprocessing import TargetEncoder as cuTE

ℹ️ Cuda available: True


In [3]:
## -- IMPORT LIBRARIES --
import sys, os, gc

## -- DATA MANIPUALATION --
import numpy as np, pandas as pd, random

## -- VISUALISATION --
from IPython.display import display, Image
import matplotlib.pyplot as plt
import seaborn as sns
# import shap

## -- FUNCTIONAL TOOLS --
from time import time, sleep
from tqdm.notebook import tqdm
from itertools import combinations, product

import sklearn
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.metrics import roc_auc_score

## -- MACHINE LEARNING --
from skrub import TableVectorizer
from tabicl import TabICLClassifier
from tabpfn import TabPFNClassifier
from tabpfn_extensions import interpretability
from pytabkit import RealMLP_TD_Classifier, TabM_D_Classifier

import warnings

In [4]:
## -- Global Settings --
# sklearn.set_config(transform_output="pandas")
warnings.simplefilter('ignore')
warnings.filterwarnings('ignore')

# pd.options.mode.copy_on_write = True
pd.set_option('display.max_columns', 1000)
# plt.style.use("ggplot")
sns.set_style("darkgrid")

## -- Set Global Seed --
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

COLOR = '\033[32m'
RESET = '\033[0m'

In [6]:
import kagglehub
# You may need to re-run this cell after logging in.
# kagglehub.login()
main_path = kagglehub.competition_download('playground-series-s5e11')

UnauthenticatedError: User is not authenticated

In [None]:
import kagglehub
original_path = kagglehub.dataset_download("nabihazahid/loan-prediction-dataset-2025")

In [None]:
### Load Data ###
PATH = "/kaggle/input/playground-series-s5e11/"
submit = pd.read_csv(PATH+"sample_submission.csv")
train = pd.read_csv(PATH+"train.csv").drop(['id'], axis=1)
test = pd.read_csv(PATH+"test.csv").drop(['id'], axis=1)

TARGET = "loan_paid_back"
NUMS = test.select_dtypes(include='number').columns.tolist()
CATS = test.select_dtypes(exclude='number').columns.tolist()
BASE = NUMS + CATS

ORIG_PATH = "/kaggle/input/loan-prediction-dataset-2025/"
orig = pd.read_csv(ORIG_PATH+"loan_dataset_20000.csv")[BASE+[TARGET]]

for (name, df) in dict(Train=train, Test=test, Original=orig).items():
    print(f"{name} shape: {df.shape}")

print(f"\nTotal Numerical: {len(NUMS)}")
print(f"Total Categorical: {len(CATS)}")
print(f"Total base features: {len(BASE)}")

Train shape: (593994, 12)
Test shape: (254569, 11)
Original shape: (20000, 12)

Total Numerical: 5
Total Categorical: 6
Total base features: 11


# FEATURE ENGINEERING

In [None]:
## -- Define categories for OHE --
CAT_COLS = [c for c in BASE if train[c].dtype=='object' or train[c].nunique() <= 0.01*len(train)]
print(f"ℹ️ Features for TE: {len(CAT_COLS)} -> {CAT_COLS}")

ℹ️ Features for TE: 9 -> ['debt_to_income_ratio', 'credit_score', 'interest_rate', 'gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose', 'grade_subgrade']


In [None]:
## -- Factorize using combined data --
for c in tqdm([i for i in CATS if train[i].dtype == 'object']):
    combined = pd.concat([train[c], test[c], orig[c]], axis=0)
    combined = combined.factorize()[0]
    train[c] = combined[:len(train)].astype("int8")
    test[c]  = combined[len(train):len(train)+len(test)].astype("int8")
    orig[c]  = combined[-len(orig):].astype("int8")

print('Label encoding complete!!')

  0%|          | 0/6 [00:00<?, ?it/s]

Label encoding complete!!


In [None]:
ROUND = []
for col in tqdm(['annual_income', 'loan_amount']):
    for r in [-3, -2, -1, 0]:
        n = f"{col}_r{r}"
        train[n] = train[col].round(r).astype(int)
        test[n] = test[col].round(r).astype(int)
        orig[n] = test[col].round(r).astype(int)
        ROUND.append(n)

print(f'{len(ROUND)} ROUND Features created.')
print(train[ROUND].nunique())

DIGITS = []
for col in tqdm(['debt_to_income_ratio']):
    for d in range(1, 4):
        n = f'{col}_d{d}'
        train[n] = ((train[col] * 10**d) % 10).fillna(-1).astype(int)
        test[n]  = ((test[col]  * 10**d) % 10).fillna(-1).astype(int)
        orig[n]  = ((orig[col]  * 10**d) % 10).fillna(-1).astype(int)

        ## -- Drop constant features --
        if train[n].nunique() < 2:
            train.drop([n], axis=1, inplace=True)
            test.drop([n],  axis=1, inplace=True)
            orig.drop([n],  axis=1, inplace=True)
        else:
            DIGITS.append(n)

for col in tqdm(['interest_rate']):
    for d in range(1, 3):
        n = f'{col}_d{d}'
        train[n] = ((train[col] * 10**d) % 10).fillna(-1).astype(int)
        test[n]  = ((test[col]  * 10**d) % 10).fillna(-1).astype(int)
        orig[n]  = ((orig[col]  * 10**d) % 10).fillna(-1).astype(int)

        ## -- Drop constant features --
        if train[n].nunique() < 2:
            train.drop([n], axis=1, inplace=True)
            test.drop([n],  axis=1, inplace=True)
            orig.drop([n],  axis=1, inplace=True)
        else:
            DIGITS.append(n)

print(f'{len(DIGITS)} Features DIGITS extracted.')
print(train[DIGITS].nunique())

INTER = []
for col1, col2 in tqdm(list(combinations(CAT_COLS+DIGITS, 2)), desc="Pairwise"):
    new_col = f"{col1}-|-{col2}"
    train[new_col] = train[col1].astype(str) + '_' + train[col2].astype(str)
    test[new_col] = test[col1].astype(str) + '_' + test[col2].astype(str)
    INTER.append(new_col)

print(f"Ineraction Features: {len(INTER)}")

  0%|          | 0/2 [00:00<?, ?it/s]

8 ROUND Features created.
annual_income_r-3      270
annual_income_r-2     1721
annual_income_r-1     9706
annual_income_r0     35523
loan_amount_r-3         49
loan_amount_r-2        426
loan_amount_r-1       3558
loan_amount_r0       21644
dtype: int64


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

5 Features DIGITS extracted.
debt_to_income_ratio_d1     7
debt_to_income_ratio_d2    10
debt_to_income_ratio_d3    10
interest_rate_d1           10
interest_rate_d2           10
dtype: int64


Pairwise:   0%|          | 0/91 [00:00<?, ?it/s]

Ineraction Features: 91


In [None]:
ORIG = []

if 'y' in orig.columns:
    BASE.append('y')

for col in tqdm(BASE): # +ROUND
    # MEAN
    mean_map = orig.groupby(col)[TARGET].mean()
    mean_col = f"OTE_{col}_mean"
    mean_map.name = mean_col

    train = train.merge(mean_map, on=col, how='left')
    test = test.merge(mean_map, on=col, how='left')
    ORIG.append(mean_col)

    # COUNT
    count_col = f"OTE_{col}_count"
    count_map = orig.groupby(col).size().reset_index(name=count_col)

    train = train.merge(count_map, on=col, how='left')
    test = test.merge(count_map, on=col, how='left')
    ORIG.append(count_col)

print(len(ORIG), 'Orig Features Created!!')

  0%|          | 0/11 [00:00<?, ?it/s]

22 Orig Features Created!!


In [None]:
import os, sys
from contextlib import contextmanager

@contextmanager
def suppress_stdout():
    with open(os.devnull, "w") as devnull:
        old_stdout = sys.stdout
        sys.stdout = devnull
        try:
            yield
        finally:
            sys.stdout = old_stdout

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class TargetEncoder(BaseEstimator, TransformerMixin):
    """
    Target Encoder that supports multiple aggregation functions,
    internal cross-validation for leakage prevention, and smoothing.

    Parameters
    ----------
    cols_to_encode : list of str
        List of column names to be target encoded.

    aggs : list of str, default=['mean']
        List of aggregation functions to apply. Any function accepted by
        pandas' `.agg()` method is supported, such as:
        'mean', 'std', 'var', 'min', 'max', 'skew', 'nunique',
        'count', 'sum', 'median'.
        Smoothing is applied only to the 'mean' aggregation.

    cv : int, default=5
        Number of folds for cross-validation in fit_transform.

    smooth : float or 'auto', default='auto'
        The smoothing parameter `m`. A larger value puts more weight on the
        global mean. If 'auto', an empirical Bayes estimate is used.

    drop_original : bool, default=False
        If True, the original columns to be encoded are dropped.
    """
    def __init__(self, cols_to_encode, aggs=['mean'], cv=5, smooth='auto', drop_original=False):
        self.cols_to_encode = cols_to_encode
        self.aggs = aggs
        self.cv = cv
        self.smooth = smooth
        self.drop_original = drop_original
        self.mappings_ = {}
        self.global_stats_ = {}

    def fit(self, X, y):
        """
        Learn mappings from the entire dataset.
        These mappings are used for the transform method on validation/test data.
        """
        temp_df = X.copy()
        temp_df['target'] = y

        # Learn global statistics for each aggregation
        for agg_func in self.aggs:
            self.global_stats_[agg_func] = y.agg(agg_func)

        # Learn category-specific mappings
        for col in self.cols_to_encode:
            self.mappings_[col] = {}
            for agg_func in self.aggs:
                mapping = temp_df.groupby(col)['target'].agg(agg_func)
                self.mappings_[col][agg_func] = mapping

        return self

    def transform(self, X):
        """
        Apply learned mappings to the data.
        Unseen categories are filled with global statistics.
        """
        X_transformed = X.copy()
        for col in self.cols_to_encode:
            for agg_func in self.aggs:
                new_col_name = f'TE_{col}_{agg_func}'
                map_series = self.mappings_[col][agg_func]
                X_transformed[new_col_name] = X[col].map(map_series)
                X_transformed[new_col_name].fillna(self.global_stats_[agg_func], inplace=True)

        if self.drop_original:
            X_transformed.drop(columns=self.cols_to_encode, inplace=True)

        return X_transformed

    def fit_transform(self, X, y):
        """
        Fit and transform the data using internal cross-validation to prevent leakage.
        """
        # First, fit on the entire dataset to get global mappings for transform method
        self.fit(X, y)

        # Initialize an empty DataFrame to store encoded features
        encoded_features = pd.DataFrame(index=X.index)

        kf = KFold(n_splits=self.cv, shuffle=True, random_state=42)

        for train_idx, val_idx in kf.split(X, y):
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_val = X.iloc[val_idx]

            temp_df_train = X_train.copy()
            temp_df_train['target'] = y_train

            for col in self.cols_to_encode:
                # --- Calculate mappings only on the training part of the fold ---
                for agg_func in self.aggs:
                    new_col_name = f'TE_{col}_{agg_func}'

                    # Calculate global stat for this fold
                    fold_global_stat = y_train.agg(agg_func)

                    # Calculate category stats for this fold
                    mapping = temp_df_train.groupby(col)['target'].agg(agg_func)

                    # --- Apply smoothing only for 'mean' aggregation ---
                    if agg_func == 'mean':
                        counts = temp_df_train.groupby(col)['target'].count()

                        m = self.smooth
                        if self.smooth == 'auto':
                            # Empirical Bayes smoothing
                            variance_between = mapping.var()
                            avg_variance_within = temp_df_train.groupby(col)['target'].var().mean()
                            if variance_between > 0:
                                m = avg_variance_within / variance_between
                            else:
                                m = 0  # No smoothing if no variance between groups

                        # Apply smoothing formula
                        smoothed_mapping = (counts * mapping + m * fold_global_stat) / (counts + m)
                        encoded_values = X_val[col].map(smoothed_mapping)
                    else:
                        encoded_values = X_val[col].map(mapping)

                    # Store encoded values for the validation fold
                    encoded_features.loc[X_val.index, new_col_name] = encoded_values.fillna(fold_global_stat)

        # Merge with original DataFrame
        X_transformed = X.copy()
        for col in encoded_features.columns:
            X_transformed[col] = encoded_features[col]

        if self.drop_original:
            X_transformed.drop(columns=self.cols_to_encode, inplace=True)

        return X_transformed

In [None]:
FEATURES = [c for c in train.columns if c != TARGET]
print('Total Features', len(FEATURES))

Total Features 137


In [None]:
train.head()

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back,annual_income_r-3,annual_income_r-2,annual_income_r-1,annual_income_r0,loan_amount_r-3,loan_amount_r-2,loan_amount_r-1,loan_amount_r0,debt_to_income_ratio_d1,debt_to_income_ratio_d2,debt_to_income_ratio_d3,interest_rate_d1,interest_rate_d2,debt_to_income_ratio-|-credit_score,debt_to_income_ratio-|-interest_rate,debt_to_income_ratio-|-gender,debt_to_income_ratio-|-marital_status,debt_to_income_ratio-|-education_level,debt_to_income_ratio-|-employment_status,debt_to_income_ratio-|-loan_purpose,debt_to_income_ratio-|-grade_subgrade,debt_to_income_ratio-|-debt_to_income_ratio_d1,debt_to_income_ratio-|-debt_to_income_ratio_d2,debt_to_income_ratio-|-debt_to_income_ratio_d3,debt_to_income_ratio-|-interest_rate_d1,debt_to_income_ratio-|-interest_rate_d2,credit_score-|-interest_rate,credit_score-|-gender,credit_score-|-marital_status,credit_score-|-education_level,credit_score-|-employment_status,credit_score-|-loan_purpose,credit_score-|-grade_subgrade,credit_score-|-debt_to_income_ratio_d1,credit_score-|-debt_to_income_ratio_d2,credit_score-|-debt_to_income_ratio_d3,credit_score-|-interest_rate_d1,credit_score-|-interest_rate_d2,interest_rate-|-gender,interest_rate-|-marital_status,interest_rate-|-education_level,interest_rate-|-employment_status,interest_rate-|-loan_purpose,interest_rate-|-grade_subgrade,interest_rate-|-debt_to_income_ratio_d1,interest_rate-|-debt_to_income_ratio_d2,interest_rate-|-debt_to_income_ratio_d3,interest_rate-|-interest_rate_d1,interest_rate-|-interest_rate_d2,gender-|-marital_status,gender-|-education_level,gender-|-employment_status,gender-|-loan_purpose,gender-|-grade_subgrade,gender-|-debt_to_income_ratio_d1,gender-|-debt_to_income_ratio_d2,gender-|-debt_to_income_ratio_d3,gender-|-interest_rate_d1,gender-|-interest_rate_d2,marital_status-|-education_level,marital_status-|-employment_status,marital_status-|-loan_purpose,marital_status-|-grade_subgrade,marital_status-|-debt_to_income_ratio_d1,marital_status-|-debt_to_income_ratio_d2,marital_status-|-debt_to_income_ratio_d3,marital_status-|-interest_rate_d1,marital_status-|-interest_rate_d2,education_level-|-employment_status,education_level-|-loan_purpose,education_level-|-grade_subgrade,education_level-|-debt_to_income_ratio_d1,education_level-|-debt_to_income_ratio_d2,education_level-|-debt_to_income_ratio_d3,education_level-|-interest_rate_d1,education_level-|-interest_rate_d2,employment_status-|-loan_purpose,employment_status-|-grade_subgrade,employment_status-|-debt_to_income_ratio_d1,employment_status-|-debt_to_income_ratio_d2,employment_status-|-debt_to_income_ratio_d3,employment_status-|-interest_rate_d1,employment_status-|-interest_rate_d2,loan_purpose-|-grade_subgrade,loan_purpose-|-debt_to_income_ratio_d1,loan_purpose-|-debt_to_income_ratio_d2,loan_purpose-|-debt_to_income_ratio_d3,loan_purpose-|-interest_rate_d1,loan_purpose-|-interest_rate_d2,grade_subgrade-|-debt_to_income_ratio_d1,grade_subgrade-|-debt_to_income_ratio_d2,grade_subgrade-|-debt_to_income_ratio_d3,grade_subgrade-|-interest_rate_d1,grade_subgrade-|-interest_rate_d2,debt_to_income_ratio_d1-|-debt_to_income_ratio_d2,debt_to_income_ratio_d1-|-debt_to_income_ratio_d3,debt_to_income_ratio_d1-|-interest_rate_d1,debt_to_income_ratio_d1-|-interest_rate_d2,debt_to_income_ratio_d2-|-debt_to_income_ratio_d3,debt_to_income_ratio_d2-|-interest_rate_d1,debt_to_income_ratio_d2-|-interest_rate_d2,debt_to_income_ratio_d3-|-interest_rate_d1,debt_to_income_ratio_d3-|-interest_rate_d2,interest_rate_d1-|-interest_rate_d2,OTE_annual_income_mean,OTE_annual_income_count,OTE_debt_to_income_ratio_mean,OTE_debt_to_income_ratio_count,OTE_credit_score_mean,OTE_credit_score_count,OTE_loan_amount_mean,OTE_loan_amount_count,OTE_interest_rate_mean,OTE_interest_rate_count,OTE_gender_mean,OTE_gender_count,OTE_marital_status_mean,OTE_marital_status_count,OTE_education_level_mean,OTE_education_level_count,OTE_employment_status_mean,OTE_employment_status_count,OTE_loan_purpose_mean,OTE_loan_purpose_count,OTE_grade_subgrade_mean,OTE_grade_subgrade_count
0,29367.99,0.084,736,2528.42,13.67,0,0,0,0,0,0,1.0,29000,29400,29370,29368,3000,2500,2530,2528,0,8,4,6,7,0.084_736,0.084_13.67,0.084_0,0.084_0,0.084_0,0.084_0,0.084_0,0.084_0,0.084_0,0.084_8,0.084_4,0.084_6,0.084_7,736_13.67,736_0,736_0,736_0,736_0,736_0,736_0,736_0,736_8,736_4,736_6,736_7,13.67_0,13.67_0,13.67_0,13.67_0,13.67_0,13.67_0,13.67_0,13.67_8,13.67_4,13.67_6,13.67_7,0_0,0_0,0_0,0_0,0_0,0_0,0_8,0_4,0_6,0_7,0_0,0_0,0_0,0_0,0_0,0_8,0_4,0_6,0_7,0_0,0_0,0_0,0_0,0_8,0_4,0_6,0_7,0_0,0_0,0_0,0_8,0_4,0_6,0_7,0_0,0_0,0_8,0_4,0_6,0_7,0_0,0_8,0_4,0_6,0_7,0_8,0_4,0_6,0_7,8_4,8_6,8_7,4_6,4_7,6_7,0.0,1.0,0.884615,78,0.862069,87,1.0,1,0.8,30,0.802472,10034,0.799579,9031,0.808076,5919,0.885734,2923,0.802745,2550,0.821004,1514
1,22108.02,0.166,636,4593.1,12.92,1,1,1,1,1,1,0.0,22000,22100,22110,22108,5000,4600,4590,4593,1,6,6,9,2,0.166_636,0.166_12.92,0.166_1,0.166_1,0.166_1,0.166_1,0.166_1,0.166_1,0.166_1,0.166_6,0.166_6,0.166_9,0.166_2,636_12.92,636_1,636_1,636_1,636_1,636_1,636_1,636_1,636_6,636_6,636_9,636_2,12.92_1,12.92_1,12.92_1,12.92_1,12.92_1,12.92_1,12.92_1,12.92_6,12.92_6,12.92_9,12.92_2,1_1,1_1,1_1,1_1,1_1,1_1,1_6,1_6,1_9,1_2,1_1,1_1,1_1,1_1,1_1,1_6,1_6,1_9,1_2,1_1,1_1,1_1,1_1,1_6,1_6,1_9,1_2,1_1,1_1,1_1,1_6,1_6,1_9,1_2,1_1,1_1,1_6,1_6,1_9,1_2,1_1,1_6,1_6,1_9,1_2,1_6,1_6,1_9,1_2,6_6,6_9,6_2,6_9,6_2,9_2,1.0,1.0,0.765432,81,0.649485,97,1.0,1,0.56,25,0.797504,9536,0.800312,8974,0.797798,3724,0.886984,13007,0.799399,7981,0.727749,1146
2,49566.2,0.097,694,17005.15,9.76,1,0,0,1,1,2,1.0,50000,49600,49570,49566,17000,17000,17010,17005,0,9,7,7,6,0.097_694,0.097_9.76,0.097_1,0.097_0,0.097_0,0.097_1,0.097_1,0.097_2,0.097_0,0.097_9,0.097_7,0.097_7,0.097_6,694_9.76,694_1,694_0,694_0,694_1,694_1,694_2,694_0,694_9,694_7,694_7,694_6,9.76_1,9.76_0,9.76_0,9.76_1,9.76_1,9.76_2,9.76_0,9.76_9,9.76_7,9.76_7,9.76_6,1_0,1_0,1_1,1_1,1_2,1_0,1_9,1_7,1_7,1_6,0_0,0_1,0_1,0_2,0_0,0_9,0_7,0_7,0_6,0_1,0_1,0_2,0_0,0_9,0_7,0_7,0_6,1_1,1_2,1_0,1_9,1_7,1_7,1_6,1_2,1_0,1_9,1_7,1_7,1_6,2_0,2_9,2_7,2_7,2_6,0_9,0_7,0_7,0_6,9_7,9_7,9_6,7_7,7_6,7_6,,,0.923077,91,0.825243,103,1.0,1,0.869565,23,0.797504,9536,0.799579,9031,0.808076,5919,0.886984,13007,0.799399,7981,0.831224,1422
3,46858.25,0.065,533,4682.48,16.1,0,0,0,1,1,3,1.0,47000,46900,46860,46858,5000,4700,4680,4682,0,6,5,1,0,0.065_533,0.065_16.1,0.065_0,0.065_0,0.065_0,0.065_1,0.065_1,0.065_3,0.065_0,0.065_6,0.065_5,0.065_1,0.065_0,533_16.1,533_0,533_0,533_0,533_1,533_1,533_3,533_0,533_6,533_5,533_1,533_0,16.1_0,16.1_0,16.1_0,16.1_1,16.1_1,16.1_3,16.1_0,16.1_6,16.1_5,16.1_1,16.1_0,0_0,0_0,0_1,0_1,0_3,0_0,0_6,0_5,0_1,0_0,0_0,0_1,0_1,0_3,0_0,0_6,0_5,0_1,0_0,0_1,0_1,0_3,0_0,0_6,0_5,0_1,0_0,1_1,1_3,1_0,1_6,1_5,1_1,1_0,1_3,1_0,1_6,1_5,1_1,1_0,3_0,3_6,3_5,3_1,3_0,0_6,0_5,0_1,0_0,6_5,6_1,6_0,5_1,5_0,1_0,,,0.925373,67,0.846154,13,1.0,1,0.6,10,0.802472,10034,0.799579,9031,0.808076,5919,0.886984,13007,0.799399,7981,0.653722,309
4,25496.7,0.053,665,12184.43,10.21,1,1,0,1,0,4,1.0,25000,25500,25500,25497,12000,12200,12180,12184,0,5,3,2,0,0.053_665,0.053_10.21,0.053_1,0.053_1,0.053_0,0.053_1,0.053_0,0.053_4,0.053_0,0.053_5,0.053_3,0.053_2,0.053_0,665_10.21,665_1,665_1,665_0,665_1,665_0,665_4,665_0,665_5,665_3,665_2,665_0,10.21_1,10.21_1,10.21_0,10.21_1,10.21_0,10.21_4,10.21_0,10.21_5,10.21_3,10.21_2,10.21_0,1_1,1_0,1_1,1_0,1_4,1_0,1_5,1_3,1_2,1_0,1_0,1_1,1_0,1_4,1_0,1_5,1_3,1_2,1_0,0_1,0_0,0_4,0_0,0_5,0_3,0_2,0_0,1_0,1_4,1_0,1_5,1_3,1_2,1_0,0_4,0_0,0_5,0_3,0_2,0_0,4_0,4_5,4_3,4_2,4_0,0_5,0_3,0_2,0_0,5_3,5_2,5_0,3_2,3_0,2_0,1.0,1.0,0.943662,71,0.84375,96,0.0,1,0.740741,27,0.797504,9536,0.800312,8974,0.808076,5919,0.886984,13007,0.802745,2550,0.757576,1155


## 1. PyTabKit

In [None]:
%%time
X = train[FEATURES]
y = train[TARGET]

SPLITS = 6
kf = StratifiedKFold(n_splits=SPLITS, shuffle=True, random_state=SEED)

oof_df = pd.DataFrame()
train_df = pd.DataFrame()
test_df = pd.DataFrame()
models = []

pytab_MODELS = dict(
    # realMLP = RealMLP_TD_Classifier(
    #     random_state = SEED,
    #     val_metric_name = "1-auc_ovr",
    #     use_ls = False,
    #     ),
    tabM = TabM_D_Classifier(
        random_state = SEED,
        val_metric_name = "1-auc_ovr",
    ),
)

for model_name, model in pytab_MODELS.items():
    print(f"\nℹ️ Training {model_name}...")

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    sleep(1)

    oof_parent = np.zeros(len(X))
    test_preds = np.zeros(len(test))

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"  -- Fold {fold+1}/{SPLITS} -- ", end='')

        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        X_test = test[FEATURES].copy

        # ## -> Opt1. -> Using CUSTOM
        # TE = TargetEncoder(cols_to_encode=INTER,cv=5,smooth=1.0,aggs=['mean'],drop_original=True)
        # X_train = TE.fit_transform(X_train, y_train)
        # X_val   = TE.transform(X_val)
        # X_test  = TE.transform(X_test)

        # TE = TargetEncoder(cols_to_encode=ROUND,cv=5,smooth=1.0,aggs=['mean'],drop_original=False)
        # X_train = TE.fit_transform(X_train, y_train)
        # X_val   = TE.transform(X_val)
        # X_test  = TE.transform(X_test)

        ## -> Opt2. -> Using RAPIDS ->
        for c in INTER:
            TE = cuTE(n_folds=5, smooth=1.0, stat='mean', seed=SEED)
            X_train[c] = TE.fit_transform(X_train[c], y_train)
            X_val[c] = TE.transform(X_val[c])
            X_test[c] = TE.transform(X_test[c])

        for c in ROUND:
            TE = cuTE(n_folds=5, smooth=1.0, stat='mean', seed=SEED)
            new_col = f"TE_{c}"
            X_train[new_col] = TE.fit_transform(X_train[c], y_train)
            X_val[new_col] = TE.transform(X_val[c])
            X_test[new_col] = TE.transform(X_test[c])

        # model = make_pipeline(
        #     TableVectorizer(), # -> Automatically handles various data types
        #     model,
        # )

        with suppress_stdout():
            model.fit(X_train, y_train)

        oof_pred = model.predict_proba(X_val)[:, 1]
        oof_parent[val_idx] = oof_pred
        print(f"  AUC: {roc_auc_score(y_val, oof_pred):.5f}")

        test_preds += model.predict_proba(X_test)[:, 1]

        # del X_train, y_train, X_val, y_val, oof_pred
        # gc.collect()

    train_preds /= SPLITS
    test_preds /= SPLITS

    oof_df[model_name] = oof_parent
    train_df[model_name] = np.clip(train_preds, 0, 1)
    test_df[model_name] = np.clip(test_preds, 0, 1)
    models.append(model)

    # print('|', '-'*30)
    oof_auc = roc_auc_score(y_parent, oof_parent)
    print(f"  ☑️ {COLOR}{model_name} OOF AUC: {oof_auc:.5f}{RESET}")
    print('|', '-'*30)

    del oof_parent, train_preds, test_preds
    gc.collect()


ℹ️ Training tabM...
  -- Fold 1/6 -- 

TypeError: 'method' object is not subscriptable

In [None]:
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
X_train.head(3)

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
1,20172.98,0.219,531,22663.89,17.81,0,1,2,1,1,3
2,26181.8,0.234,779,3631.36,9.53,0,0,0,1,7,19
3,11873.84,0.264,809,14939.23,7.99,0,0,0,1,0,26


In [None]:
oof_df.to_parquet('orig_pytab_models_oof.parquet', index=False)
train_df.to_parquet('orig_pytab_models_train.parquet', index=False)
test_df.to_parquet('orig_pytab_models_test.parquet', index=False)

In [None]:
## -- Calculate SHAP values --
shap_values = interpretability.shap.get_shap_values(
    estimator= models[1],
    test_x= X_val.iloc[:100],
    attribute_names= X_parent.columns,
    algorithm= "permutation",
)

## -- Create visualization --
fig = interpretability.shap.plot_shap(shap_values)
fig.tight_layout()
plt.show()

TypeError: unsupported operand type(s) for -: 'str' and 'str'

## 2. PFNs

In [None]:
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
%%time
## -- STEP 1: Train Base Model on the 590k Child Data --
cat_indices = [X_parent.columns.get_loc(col) for col in CATS]

tfm_MODELS = dict(
    # tabPFN = TabPFNClassifier(
    #     # n_estimators=10,
    #     categorical_features_indices= cat_indices,
    #     random_state= SEED,
    #     eval_metric= 'roc_auc',
    #     ),
    tabICL = TabICLClassifier( # gpu memory issues
        random_state= SEED,
    ),
)

for model_name, model in tfm_MODELS.items():
    print(f"\nℹ️ Training {model_name}...")

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    sleep(2)

    oof_parent = np.zeros(len(X_parent))
    train_preds = np.zeros(len(X_child))
    test_preds = np.zeros(len(X_test))

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_parent, y_parent)):
        print(f"  -- Fold {fold+1}/{SPLITS} -- | ", end='')

        X_train, X_val = X_parent.iloc[train_idx], X_parent.iloc[val_idx]
        y_train, y_val = y_parent.iloc[train_idx].values, y_parent.iloc[val_idx].values

        # model = make_pipeline(
        #     TableVectorizer(), # -> Handles various data types
        #     model,
        # )
        # with suppress_stdout():
        model.fit(X_train, y_train)

        oof_pred = model.predict_proba(X_val)[:, 1]
        oof_parent[val_idx] = oof_pred

        print(f"AUC: {roc_auc_score(y_val, oof_pred):.5f}")

        train_preds += model.predict_proba(X_child)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1]

        # del X_train, y_train, X_val, y_val, oof_pred
        # gc.collect()

    train_preds /= SPLITS
    test_preds /= SPLITS

    oof_df[model_name] = oof_parent
    train_df[model_name] = np.clip(train_preds, 0, 1)
    test_df[model_name] = np.clip(test_preds, 0, 1)
    models.append(model)

    # print('|', '-'*30)
    oof_auc = roc_auc_score(y_parent, oof_parent)
    print(f"  ☑️ {COLOR}{model_name} OOF AUC: {oof_auc:.5f}{RESET}")
    print('|', '-'*30)

    del oof_parent, train_preds, test_preds
    gc.collect()



ℹ️ Training tabPFN...
  -- Fold 1/8 -- | AUC: 0.90527
  -- Fold 2/8 -- | AUC: 0.88974
  -- Fold 3/8 -- | AUC: 0.89057


## 2. TabPFN

In [None]:
oof_parent  = np.zeros(len(X_parent))
x_child_preds = np.zeros(len(X_child))
x_test_preds = np.zeros(len(X_test))
fold_scores = []

cat_indices = [X_parent.columns.get_loc(col) for col in CATS]
params = dict(
    # n_estimators=10,
    categorical_features_indices=cat_indices,
    random_state=SEED,
    eval_metric='roc_auc',
    # n_preprocessing_jobs=2
)

tik = time()
for fold, (train_idx, val_idx) in enumerate(kf.split(X_parent, y_parent)):
    print(f"|-- Fold {fold+1}/{SPLITS} --| ", end='')

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    X_train, X_val = X_parent.iloc[train_idx], X_parent.iloc[val_idx]
    y_train, y_val = y_parent.iloc[train_idx].values, y_parent.iloc[val_idx].values

    # Opt1. -> Using CUSTOM
    # TE = TargetEncoder(cols_to_encode=INTER+ROUND,cv=5,smooth=1.0,aggs=['mean'],drop_original=True)
    # X_train = TE.fit_transform(X_train, y_train)
    # X_val   = TE.transform(X_val)
    # X_test  = TE.transform(X_test)

    # # Opt2. -> Using RAPIDS
    # for c in INTER+ROUND: #
    #     TE = cuTE(n_folds=5, smooth=1.0, stat='mean', split_method='interleaved', seed=SEED)
    #     X_train[c] = TE.fit_transform(X_train[c], y_train)
    #     X_val[c] = TE.transform(X_val[c])
    #     X_test[c] = TE.transform(X_test[c])

    # for c in model_cats:
    #     X_train[c] = X_train[c].astype('category')
    #     X_val[c]   = pd.Categorical(X_val[c], categories=X_train[c].cat.categories)
    #     X_test[c]  = pd.Categorical(X_test[c], categories=X_train[c].cat.categories)

    tabpfn_model = TabPFNClassifier(**params)
    tabpfn_model.fit(X_train, y_train)

    X_val_preds = tabpfn_model.predict_proba(X_val)[:, 1]
    oof_parent[val_idx] = X_val_preds

    x_child_preds += tabpfn_model.predict_proba(X_child)[:, 1]
    x_test_preds += tabpfn_model.predict_proba(X_test)[:, 1]

    score = roc_auc_score(y_val, X_val_preds)
    fold_scores.append(score)

    print(f"{COLOR}AUC: {score:.5f}{RESET}")

    # del X_train, y_train, X_val, y_val, X_val_preds
    gc.collect()

oof_auc = roc_auc_score(y_parent, oof_parent)
avg_auc = np.mean(fold_scores)
print('-'*25)
print(f"{COLOR}TabPFN OOF AUC: {oof_auc:.5f}{RESET}")
print(f"{COLOR}TabPFN AVG AUC: {avg_auc:.5f}{RESET}")

print('-'*25)
tok = time()
tiktok = (tok-tik) / 60
models.append(tabpfn_model)

ℹ️ Device GPU: Tesla T4
ℹ️ No. of CPU: 2 cores
| --------------------
-- Fold 1/5 [32mAUC: 0.90183[0m
| --------------------
-- Fold 2/5 [32mAUC: 0.88713[0m
| --------------------
-- Fold 3/5 [32mAUC: 0.90732[0m
| --------------------
-- Fold 4/5 [32mAUC: 0.90173[0m
| --------------------
-- Fold 5/5 [32mAUC: 0.89574[0m
--------------------
Training Time: 99.77 mins[0m
--------------------
Training Complete! TabPFN added to train/test dataframes
--------------------


In [None]:
x_child_preds /= SPLITS
x_test_preds /= SPLITS

oof_df['tabPFN'] = oof_parent
train_df['tabPFN'] = np.clip(x_child_preds, 0, 1)
test_df['tabPFN'] = np.clip(x_test_preds, 0, 1)

print('-'*50)
print(f"TabPFN predictions added to train/test dataframes")
print('-'*50)
len(train_df), len(test_df)

In [None]:
oof_df.head()

In [None]:
train_df.head()

Unnamed: 0,realMLP,tabMD,tabPFN
0,0.950771,0.999651,0.999529
1,0.804589,0.842282,0.857477
2,0.925822,0.97271,0.981672
3,0.806939,0.826712,0.837671
4,0.927357,0.96445,0.981719


In [None]:
test_df.head()

Unnamed: 0,realMLP,tabMD,tabPFN
0,0.879079,0.92752,0.939235
1,0.947719,0.999631,0.998314
2,0.682412,0.697294,0.723108
3,0.88695,0.919878,0.937167
4,0.917761,0.959522,0.981078


In [None]:
# !rm -r /content/lightning_logs

In [None]:
oof_df.to_parquet('orig_NN_models_oof.parquet', index=False)
train_df.to_parquet('orig_NN_models_train.parquet', index=False)
test_df.to_parquet('orig_NN_models_test.parquet', index=False)