<a href="https://colab.research.google.com/github/cod3astro/kaggle_ML_competition/blob/main/loan_payback.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore', message=".*X does not have valid feature names.*")

In [3]:
train = pd.read_csv('train.csv', index_col='id')
test = pd.read_csv('test.csv', index_col='id')
train.head()

Unnamed: 0_level_0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [4]:
train.shape

(593994, 12)

In [5]:
train.isnull().any().any()

np.False_

In [6]:
train.describe(include='all')

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
count,593994.0,593994.0,593994.0,593994.0,593994.0,593994,593994,593994,593994,593994,593994,593994.0
unique,,,,,,3,4,5,5,8,30,
top,,,,,,Female,Single,Bachelor's,Employed,Debt consolidation,C3,
freq,,,,,,306175,288843,279606,450645,324695,58695,
mean,48212.202976,0.120696,680.916009,15020.297629,12.356345,,,,,,,0.79882
std,26711.942078,0.068573,55.424956,6926.530568,2.008959,,,,,,,0.400883
min,6002.43,0.011,395.0,500.09,3.2,,,,,,,0.0
25%,27934.4,0.072,646.0,10279.62,10.99,,,,,,,1.0
50%,46557.68,0.096,682.0,15000.22,12.37,,,,,,,1.0
75%,60981.32,0.156,719.0,18858.58,13.68,,,,,,,1.0


In [7]:
def create_derived_features(df):
    df = train.copy()
    print(f"Processing dataset shape: {df.shape}")

    # 1. Rounding Features
    # This reduces noise by grouping similar values
    for col in ['annual_income', 'loan_amount']:
        if col in df.columns:
            # Round to nearest whole number (1s)
            df[f'{col}_ROUND_1s'] = df[col].round(0).astype(int)
            # Round to nearest ten (10s)
            df[f'{col}_ROUND_10s'] = df[col].round(-1).astype(int)
            print(f"  -> Created rounding features for {col}")

    # 2. Subgrade and Grade Extraction
    # Splits "B2" into Grade "B" and Subgrade "2"
    if 'grade_subgrade' in df.columns:
        df['grade'] = df['grade_subgrade'].str[0]
        # taking the rest of the string and converting to int
        df['subgrade'] = df['grade_subgrade'].str[1:].astype(int)
        print(f"  -> Extracted grade and subgrade")

    # 3. Total Debt Burden Calculation
    # Formula: (Loan Amount * Interest Rate) / Annual Income
    req_cols = ['loan_amount', 'interest_rate', 'annual_income']
    if all(col in df.columns for col in req_cols):
        df['total_debt_burden'] = (
            (df['loan_amount'] * df['interest_rate'] / 100) /
            (df['annual_income'] + 1) # +1 prevents division by zero error
        )
        print(f"  -> Calculated total_debt_burden")

    return df

# --- EXECUTION ---
print("STEP 1: Creating Derived Features...")

# Apply to Train Data
df = create_derived_features(train)

print("\nDone! Check your dataframe head to see new columns.")

STEP 1: Creating Derived Features...
Processing dataset shape: (593994, 12)
  -> Created rounding features for annual_income
  -> Created rounding features for loan_amount
  -> Extracted grade and subgrade
  -> Calculated total_debt_burden

Done! Check your dataframe head to see new columns.


In [8]:
df.head()

Unnamed: 0_level_0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back,annual_income_ROUND_1s,annual_income_ROUND_10s,loan_amount_ROUND_1s,loan_amount_ROUND_10s,grade,subgrade,total_debt_burden
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0,29368,29370,2528,2530,C,3,0.011769
1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0,22108,22110,4593,4590,D,3,0.026841
2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0,49566,49570,17005,17010,C,5,0.033484
3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0,46858,46860,4682,4680,F,1,0.016088
4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0,25497,25500,12184,12180,D,1,0.04879


In [9]:
def create_derived_features(test_df):
    test_df = test.copy()
    print(f"Processing dataset shape: {test_df.shape}")

    # 1. Rounding Features
    # This reduces noise by grouping similar values
    for col in ['annual_income', 'loan_amount']:
        if col in test_df.columns:
            # Round to nearest whole number (1s)
            test_df[f'{col}_ROUND_1s'] = test_df[col].round(0).astype(int)
            # Round to nearest ten (10s)
            test_df[f'{col}_ROUND_10s'] = test_df[col].round(-1).astype(int)
            print(f"  -> Created rounding features for {col}")

    # 2. Subgrade and Grade Extraction
    # Splits "B2" into Grade "B" and Subgrade "2"
    if 'grade_subgrade' in test_df.columns:
        test_df['grade'] = test_df['grade_subgrade'].str[0]
        # taking the rest of the string and converting to int
        test_df['subgrade'] = test_df['grade_subgrade'].str[1:].astype(int)
        print(f"  -> Extracted grade and subgrade")

    # 3. Total Debt Burden Calculation
    # Formula: (Loan Amount * Interest Rate) / Annual Income
    req_cols = ['loan_amount', 'interest_rate', 'annual_income']
    if all(col in test_df.columns for col in req_cols):
        test_df['total_debt_burden'] = (
            (test_df['loan_amount'] * test_df['interest_rate'] / 100) /
            (test_df['annual_income'] + 1) # +1 prevents division by zero error
        )
        print(f"  -> Calculated total_debt_burden")

    return test_df

# --- EXECUTION ---
print("STEP 1: Creating Derived Features...")
test_df = create_derived_features(test)

print("\nDone! Check your dataframe head to see new columns.")

STEP 1: Creating Derived Features...
Processing dataset shape: (254569, 11)
  -> Created rounding features for annual_income
  -> Created rounding features for loan_amount
  -> Extracted grade and subgrade
  -> Calculated total_debt_burden

Done! Check your dataframe head to see new columns.


In [10]:
cols_to_drop = ['loan_amount_ROUND_1s', 'loan_amount_ROUND_10s', 'annual_income_ROUND_1s', 'annual_income_ROUND_10s', 'grade_subgrade']
df = df.drop(columns=cols_to_drop)

In [11]:
df.columns

Index(['annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount',
       'interest_rate', 'gender', 'marital_status', 'education_level',
       'employment_status', 'loan_purpose', 'loan_paid_back', 'grade',
       'subgrade', 'total_debt_burden'],
      dtype='object')

In [12]:
feature_list = ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose', 'grade']
target_column = df['loan_paid_back']

In [13]:
X = df.drop(columns=['loan_paid_back'])
y_unflipped = df['loan_paid_back']
y = y_unflipped.map({1: 0, 0: 1})

In [14]:
from sklearn.model_selection import KFold

def apply_target_encoding(X, test_df, feature_list, target_column, n_folds=10):

    # Applies Target Encoding with K-Fold Cross-Validation to prevent data leakage.
    # Work on copies to avoid messing up original dfs
    X_encoded = X.copy()
    test_encoded = test_df.copy()

    # Setup K-Fold
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    print(f"Starting Target Encoding on {len(feature_list)} features...")

    for feature in feature_list:
        # Create a new column name, e.g., 'mean_grade'
        new_col_name = f'mean_{feature}'

        # Initialize the new column with zeros
        X_encoded[new_col_name] = 0.0

        # --- PART A: ENCODE TRAINING DATA (With K-Fold) ---
        # We split train data into chunks. We calculate means on 9 chunks
        # and apply them to the 1 chunk we left out.
        for train_idx, val_idx in kfold.split(X_encoded):
            # Get the rows for this fold
            train_X_fold = X_encoded.iloc[train_idx]
            val_X_fold = X_encoded.iloc[val_idx]

            # Calculate mean target for each category in this fold
            target_means = train_X_fold.groupby(feature)[target_column].mean()

            # Map these means to the validation chunk
            X_encoded.loc[val_idx, new_col_name] = val_X_fold[feature].map(target_means)

        # Fill any NaNs in train (rare categories) with the global mean
        global_mean = X[target_column].mean()
        X_encoded[new_col_name] = X_encoded[new_col_name].fillna(global_mean)

        # --- PART B: ENCODE TEST DATA ---
        # For test data, we just use the mean from the ENTIRE training set
        # (This is allowed because test data is "future" data)
        global_target_means = X.groupby(feature)[target_column].mean()
        test_encoded[new_col_name] = test_encoded[feature].map(global_target_means)

        # Fill NaNs in test with global mean
        test_encoded[new_col_name] = test_encoded[new_col_name].fillna(global_mean)

        print(f"  -> Created {new_col_name}")

    return X_encoded, test_encoded

# --- EXECUTION ---

# 1. Define which columns to encode
# (Make sure these exist in your dataframe!)
cols_to_encode = ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose', 'grade']

# 2. Check if columns actually exist before running
valid_cols = [c for c in cols_to_encode if c in X.columns]

# 3. Run the function
train_df, test_df = apply_target_encoding(
    df,
    test_df,
    valid_cols,
    target_column='loan_paid_back'
)

y_encoded = train_df['loan_paid_back']
X_encoded = train_df.drop(columns=['loan_paid_back'])

print("\nTarget Encoding Complete.")

Starting Target Encoding on 6 features...
  -> Created mean_gender
  -> Created mean_marital_status
  -> Created mean_education_level
  -> Created mean_employment_status
  -> Created mean_loan_purpose
  -> Created mean_grade

Target Encoding Complete.


In [15]:
X_encoded.head()

Unnamed: 0_level_0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade,subgrade,total_debt_burden,mean_gender,mean_marital_status,mean_education_level,mean_employment_status,mean_loan_purpose,mean_grade
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C,3,0.011769,0.801849,0.79862,0.809872,0.899373,0.802523,0.847408
1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D,3,0.026841,0.795701,0.799062,0.801979,0.89431,0.796776,0.715627
2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C,5,0.033484,0.795911,0.799196,0.809513,0.894361,0.797319,0.847379
3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F,1,0.016088,0.801921,0.799147,0.809975,0.894136,0.797424,0.625832
4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D,1,0.04879,0.795721,0.799128,0.809427,0.894031,0.802351,0.715151


In [16]:
numerical_col = df.select_dtypes(include=np.number).columns
numerical_col = numerical_col.drop('loan_paid_back')

In [17]:
categorical_col = df.select_dtypes(include='object').columns
for col in categorical_col:
    unique_values = df[col].unique()
    print(f'{col} ({len(unique_values)} unique)')
    print(df[col].unique())

gender (3 unique)
['Female' 'Male' 'Other']
marital_status (4 unique)
['Single' 'Married' 'Divorced' 'Widowed']
education_level (5 unique)
['High School' "Master's" "Bachelor's" 'PhD' 'Other']
employment_status (5 unique)
['Self-employed' 'Employed' 'Unemployed' 'Retired' 'Student']
loan_purpose (8 unique)
['Other' 'Debt consolidation' 'Home' 'Education' 'Vacation' 'Car'
 'Medical' 'Business']
grade (6 unique)
['C' 'D' 'F' 'E' 'B' 'A']


In [18]:
train_df['loan_paid_back'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
loan_paid_back,Unnamed: 1_level_1
1.0,0.79882
0.0,0.20118


In [19]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix, roc_auc_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks, NearMiss
from scipy.stats import pointbiserialr, chi2_contingency
from imblearn.pipeline import Pipeline as ImbPipeline

In [20]:
education_order = ['High School', "Bachelor's", "Master's", 'PhD', 'Other']
grade_order = ['A', 'B', 'C', 'D', 'E', 'F'] # Define the hierarchy here

ordinal_col = ['education_level', 'grade']
onehot_col = ['gender', 'employment_status', 'loan_purpose', 'marital_status']

ordinal_transformer = OrdinalEncoder(
    categories=[education_order, grade_order],
    handle_unknown='use_encoded_value',
    unknown_value=-1
)

numerical_transformer = StandardScaler()
onehot_transformer = OneHotEncoder(handle_unknown='ignore')

# 4. Create the Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_col),
        ('onehot', onehot_transformer, onehot_col),
        ('ordinal', ordinal_transformer, ordinal_col)
    ],
    remainder='passthrough'
)

In [21]:
train_X, val_X, train_y, val_y = train_test_split(X_encoded, y_encoded, random_state=42, stratify=y)

In [22]:
train_X.head()

Unnamed: 0_level_0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade,subgrade,total_debt_burden,mean_gender,mean_marital_status,mean_education_level,mean_employment_status,mean_loan_purpose,mean_grade
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
83160,73381.14,0.213,695,8520.98,17.02,Male,Married,Bachelor's,Unemployed,Home,C,2,0.019763,0.79565,0.79844,0.788336,0.077135,0.822113,0.847003
58330,18408.16,0.437,619,15458.58,12.23,Male,Married,Bachelor's,Self-employed,Debt consolidation,D,2,0.102698,0.79565,0.79844,0.788336,0.898406,0.796339,0.714917
189451,37358.21,0.156,683,10579.22,9.35,Female,Married,Bachelor's,Employed,Debt consolidation,C,4,0.026477,0.80183,0.799349,0.788979,0.894342,0.797094,0.847404
135863,28762.38,0.134,634,20880.06,12.77,Male,Married,High School,Employed,Debt consolidation,D,1,0.092701,0.795911,0.799289,0.809513,0.894361,0.797319,0.715782
334421,82407.15,0.121,668,12093.96,12.18,Female,Married,High School,Employed,Other,D,1,0.017875,0.801626,0.798972,0.809547,0.893961,0.802674,0.714624


In [23]:
negatives = (train_y == 1).sum()
positives = (train_y == 0).sum()
scale_weight = negatives / positives

print(f"Negatives: {negatives}, Positives: {positives}")
print(f"Calculated scale_pos_weight: {scale_weight:.2f}")

Negatives: 355870, Positives: 89625
Calculated scale_pos_weight: 3.97


In [24]:
xgb_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(random_state=42, scale_pos_weight=scale_weight, max_depth=14, n_estimators=200, learning_rate=0.04))
    ])
xgb_model.fit(train_X, train_y)

# Get hard predictions for accuracy and confusion matrix
xgb_hard_preds = xgb_model.predict(val_X)

# Get probabilities for ROC AUC score (for the positive class, which is 1.0 in val_y)
xgb_probs = xgb_model.predict_proba(val_X)[:, 1]

print("Accuracy:", accuracy_score(val_y, xgb_hard_preds))
print('ROC AUC SCORE: ', roc_auc_score(val_y, xgb_probs))
print(confusion_matrix(val_y, xgb_hard_preds))

Accuracy: 0.900255220573876
ROC AUC SCORE:  0.917547849793391
[[ 16016  13859]
 [   953 117671]]


In [31]:
xgb_params = {
    'n_estimators': 1200,
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'scale_pos_weight': 3.17,
    'random_state': 42,
    'n_jobs': -1,
    'tree_method': 'hist'
}

xgbModel = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(**xgb_params))
    ])

print("Training XGBoost...")
xgbModel.fit(train_X, train_y)

# Generate Predictions
xgb_val_preds = xgbModel.predict_proba(val_X)[:, 1]

print('XGB ROC AUC: ', roc_auc_score(val_y, xgb_val_preds))

Training XGBoost...
XGB ROC AUC:  0.9214886463808716


In [32]:
cb_params = {
    'iterations': 1200,
    'learning_rate': 0.05,
    'depth': 6,
    'l2_leaf_reg': 5,
    'scale_pos_weight': 3.17,
    'verbose': 0,
    'random_state': 42,
    'allow_writing_files': False
}

cbModel = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', CatBoostClassifier(**cb_params))
    ])

print("Training CatBoost...")
cbModel.fit(train_X, train_y)

# Generate Predictions
cb_val_preds = cbModel.predict_proba(val_X)[:, 1]

print('CatBoost ROC AUC: ', roc_auc_score(val_y, cb_val_preds))

Training CatBoost...
CatBoost ROC AUC:  0.9213964051387571


In [25]:
lgbm_model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LGBMClassifier(random_state=42, verbosity=-1, subsample=1, num_leaves=31, n_estimators=200, learning_rate=0.1, colsample_bytree=0.7, ))
    ])

lgbm_model.fit(train_X, train_y)

# Get hard predictions for accuracy and confusion matrix
lgbm_hard_preds = lgbm_model.predict(val_X)

# Get probabilities for ROC AUC score (for the positive class, which is 1.0 in val_y)
lgbm_probs = lgbm_model.predict_proba(val_X)[:, 1]

print("Accuracy:", accuracy_score(val_y, lgbm_hard_preds))
print('ROC AUC SCORE: ', roc_auc_score(val_y, lgbm_probs))
print(confusion_matrix(val_y, lgbm_hard_preds))

Accuracy: 0.9052316850618523
ROC AUC SCORE:  0.9215397768893634
[[ 18031  11844]
 [  2229 116395]]


In [26]:
lgbmModel = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LGBMClassifier(random_state=42, verbosity=-1, subsample= 0.6, scale_pos_weight= 3.1765, reg_lambda= 10, reg_alpha= 0, num_leaves= 50,
                                     n_estimators= 1000, max_depth= 5, learning_rate= 0.1, n_jobs= -1 ))
    ])

lgbmModel.fit(train_X, train_y)

# Get hard predictions for accuracy and confusion matrix
lgbm_preds = lgbmModel.predict(val_X)

# Get probabilities for ROC AUC score (for the positive class, which is 1.0 in val_y)
lgbm_proba = lgbmModel.predict_proba(val_X)[:, 1]

print("Accuracy:", accuracy_score(val_y, lgbm_preds))
print('ROC AUC SCORE: ', roc_auc_score(val_y, lgbm_proba))
print(confusion_matrix(val_y, lgbm_preds))

Accuracy: 0.9004101037717426
ROC AUC SCORE:  0.9228302267676328
[[ 15741  14134]
 [   655 117969]]


In [27]:
cols_to_drop = ['loan_amount_ROUND_1s', 'loan_amount_ROUND_10s', 'annual_income_ROUND_1s', 'annual_income_ROUND_10s', 'grade_subgrade']
test = test_df.drop(columns=cols_to_drop)
test.head()

Unnamed: 0_level_0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade,subgrade,total_debt_burden,mean_gender,mean_marital_status,mean_education_level,mean_employment_status,mean_loan_purpose,mean_grade
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
593994,28781.05,0.049,626,11461.42,14.73,Female,Single,High School,Employed,Other,D,5,0.058657,0.801708,0.798873,0.809698,0.894145,0.802377,0.715334
593995,46626.39,0.093,732,15492.25,12.85,Female,Married,Master's,Employed,Other,C,1,0.042695,0.801708,0.799144,0.802346,0.894145,0.802377,0.84726
593996,54954.89,0.367,611,3796.41,13.29,Male,Single,Bachelor's,Employed,Debt consolidation,D,1,0.009181,0.795752,0.798873,0.788892,0.894145,0.796911,0.715334
593997,25644.63,0.11,671,6574.3,9.57,Female,Single,Bachelor's,Employed,Debt consolidation,C,3,0.024533,0.801708,0.798873,0.788892,0.894145,0.796911,0.84726
593998,25169.64,0.081,688,17696.89,12.8,Female,Married,PhD,Employed,Business,C,1,0.089994,0.801708,0.799144,0.830067,0.894145,0.813104,0.84726


In [30]:
test_preds = lgbmModel.predict_proba(test)[:, 1]
submission_df = pd.DataFrame({
    'id': test.index, # Ensure this matches the ID of your test file
    'loan_paid_back': test_preds
})

submission_df.to_csv('lgbm_single.csv', index=False)
print("Submission file saved successfully!")

Submission file saved successfully!


In [33]:
test_preds = xgbModel.predict_proba(test)[:, 1]
submission_df = pd.DataFrame({
    'id': test.index, # Ensure this matches the ID of your test file
    'loan_paid_back': test_preds
})

submission_df.to_csv('xgb_single.csv', index=False)
print("Submission file saved successfully!")

Submission file saved successfully!


In [34]:
test_preds = cbModel.predict_proba(test)[:, 1]
submission_df = pd.DataFrame({
    'id': test.index, # Ensure this matches the ID of your test file
    'loan_paid_back': test_preds
})

submission_df.to_csv('cb_single.csv', index=False)
print("Submission file saved successfully!")

Submission file saved successfully!


In [37]:
print("Generating Test Predictions...")

# lgbmModel should already be trained from your previous step
pred_lgbm = lgbmModel.predict_proba(test)[:, 1]
pred_cb = cbModel.predict_proba(test)[:, 1]

# --- 3. Blend (Ensemble) ---
# 50% LightGBM + 50% CatBoost
final_blend = (pred_lgbm * 0.6) + (pred_cb * 0.4)

# --- 4. Create Submission File ---
submission_df = pd.DataFrame({
    'id': test.index,
    'loan_paid_back': final_blend
})

submission_df.to_csv('ensemble_lgbm_cb.csv', index=False)
print("Ensemble submission saved successfully!")

Generating Test Predictions...
Ensemble submission saved successfully!


In [28]:
N_FOLDS = 10
kf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# Lists to store results
fold_scores = []
test_preds = [] # We will store 5 arrays of predictions here

print(f"Starting {N_FOLDS}-Fold Training...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_encoded, y)):

    # A. Split Data
    X_train, X_val = X_encoded.iloc[train_idx], X_encoded.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # B. Preprocessing then Fit on Train, Transform Val and Test
    X_train_proc = preprocessor.fit_transform(X_train)
    X_val_proc = preprocessor.transform(X_val)
    test_proc = preprocessor.transform(test)

    # C. Calculate Class Weight Ratio
    ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)

    # D. Train LightGBM
    model = lgb.LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=5,
        scale_pos_weight=ratio,
        verbosity=-1,
        random_state=42,
        n_jobs=-1
    )

    model.fit(
        X_train_proc, y_train,
        eval_set=[(X_val_proc, y_val)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(stopping_rounds=50)]
    )

    # E. Evaluate
    val_probs = model.predict_proba(X_val_proc)[:, 1]
    score = roc_auc_score(y_val, val_probs)
    fold_scores.append(score)
    print(f"Fold {fold+1} AUC: {score:.5f}")

    # F. Predict on TEST Data
    fold_test_pred = model.predict_proba(test_proc)[:, 1]
    test_preds.append(fold_test_pred)

# --- 4. AVERAGE PREDICTIONS ---
print(f"\nAverage Local CV Score: {np.mean(fold_scores):.5f}")

# We now have a list of 5 arrays. We average them column-wise.
# shape of test_preds is (n_folds, n_test_rows) -> we mean across axis 0
final_predictions = np.mean(test_preds, axis=0)

# --- 5. SUBMISSION ---
submission_df = pd.DataFrame({
    'id': test_df.index, # Ensure this matches the ID of your test file
    'loan_paid_back': final_predictions
})

submission_df.to_csv('lgbm_single_model_kfold.csv', index=False)
print("Submission file saved successfully!")

Starting 10-Fold Training...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[51]	valid_0's auc: 0.916828	valid_0's binary_logloss: 0.333938
Fold 1 AUC: 0.91683
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[47]	valid_0's auc: 0.913838	valid_0's binary_logloss: 0.33778
Fold 2 AUC: 0.91384
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[48]	valid_0's auc: 0.914654	valid_0's binary_logloss: 0.335948
Fold 3 AUC: 0.91465
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[47]	valid_0's auc: 0.914633	valid_0's binary_logloss: 0.336496
Fold 4 AUC: 0.91463
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[45]	valid_0's auc: 0.911562	valid_0's binary_logloss: 0.338458
Fold 5 AUC: 0.91156
Training until validation scores don't improve for 50 rounds
Early stopp

In [72]:
# Calculate class weight ratio once
ratio = float(np.sum(y == 0)) / np.sum(y == 1)
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [73]:
# ==========================================
# MODEL: LIGHTGBM TUNING
# ==========================================
print("\n--- Tuning LightGBM ---")
lgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', lgb.LGBMClassifier(objective='binary', verbosity=-1, n_jobs=-1, random_state=42))
])

lgb_param_grid = {
    'classifier__n_estimators': [500, 1000],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__num_leaves': [20, 31, 50],
    'classifier__max_depth': [-1, 5, 10],
    'classifier__subsample': [0.6, 0.8, 1.0],
    'classifier__scale_pos_weight': [ratio, ratio * 0.8],
    'classifier__reg_lambda': [0, 5, 10],
    'classifier__reg_alpha': [0, 0.1, 1]
}

search_lgb = RandomizedSearchCV(
    lgb_pipeline, lgb_param_grid, n_iter=20, scoring='roc_auc', cv=cv, verbose=1, random_state=42, n_jobs=-1
)
search_lgb.fit(X_encoded, y)

print(f"Best LGBM Score: {search_lgb.best_score_:.4f}")
print("Best LGBM Params:", search_lgb.best_params_)


--- Tuning LightGBM ---
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best LGBM Score: 0.9216
Best LGBM Params: {'classifier__subsample': 0.6, 'classifier__scale_pos_weight': np.float64(3.1765288702928873), 'classifier__reg_lambda': 10, 'classifier__reg_alpha': 0, 'classifier__num_leaves': 50, 'classifier__n_estimators': 500, 'classifier__max_depth': 5, 'classifier__learning_rate': 0.1}


In [29]:
best_lgbm_params = {
    'subsample': 0.6,
    'scale_pos_weight': 3.1765, # Using the value found
    'reg_lambda': 10,
    'reg_alpha': 0,
    'num_leaves': 50,
    'n_estimators': 1000, # I increased this to 1000 to allow Early Stopping to find the perfect spot
    'max_depth': 5,
    'learning_rate': 0.1,
    'verbosity': -1,
    'random_state': 42,
    'n_jobs': -1
}
N_FOLDS = 10
kf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

fold_scores = []
test_preds = []

print(f"Starting {N_FOLDS}-Fold Training with Optimized Params...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_encoded, y)):

    # A. Split Data
    X_train, X_val = X_encoded.iloc[train_idx], X_encoded.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # B. Preprocessing
    X_train_proc = preprocessor.fit_transform(X_train)
    X_val_proc = preprocessor.transform(X_val)
    test_proc = preprocessor.transform(test)

    # C. Train Model
    model = lgb.LGBMClassifier(**best_lgbm_params)

    model.fit(
        X_train_proc, y_train,
        eval_set=[(X_val_proc, y_val)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(stopping_rounds=100)] # Stops if no improvement for 100 rounds
    )

    # D. Evaluate
    val_probs = model.predict_proba(X_val_proc)[:, 1]
    score = roc_auc_score(y_val, val_probs)
    fold_scores.append(score)
    print(f"Fold {fold+1} AUC: {score:.5f}")

    # E. Predict on Test
    test_preds.append(model.predict_proba(test_proc)[:, 1])

# --- 4. AVERAGE & SUBMIT ---
print(f"\nAverage CV Score: {np.mean(fold_scores):.5f}")

final_predictions = np.mean(test_preds, axis=0)

submission_df = pd.DataFrame({
    'id': test_df.index,
    'loan_paid_back': final_predictions
})

submission_df.to_csv('submission_lgbm_optimized_10fold.csv', index=False)
print("Optimized submission saved successfully!")

Starting 10-Fold Training with Optimized Params...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[761]	valid_0's auc: 0.924761	valid_0's binary_logloss: 0.293164
Fold 1 AUC: 0.92476
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[573]	valid_0's auc: 0.921838	valid_0's binary_logloss: 0.300108
Fold 2 AUC: 0.92184
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[751]	valid_0's auc: 0.923672	valid_0's binary_logloss: 0.294871
Fold 3 AUC: 0.92367
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[693]	valid_0's auc: 0.923027	valid_0's binary_logloss: 0.297694
Fold 4 AUC: 0.92303
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[760]	valid_0's auc: 0.919624	valid_0's binary_logloss: 0.300369
Fold 5 AUC: 0.91962
Training until validation scores don't 