<a href="https://www.kaggle.com/code/dalloliogm/playgrounds5e11-autogluon?scriptVersionId=272717894" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# **FOREWORD**

This work is based on the amazing baseline notebook from Ravi: https://www.kaggle.com/code/ravi20076/playgrounds5e11-public-baseline-v1

I've modified it to use AutoGluon instead.

## Installing autogluon

This used to be a pain with pip. Now, `uv` works so much better.

In [None]:
!uv pip install autogluon

In [None]:
import warnings, torch
import pandas as pd, numpy as np
warnings.simplefilter('ignore')
from itertools import combinations
from tqdm.notebook import tqdm

from sklearn.model_selection import StratifiedKFold, KFold
from xgboost import XGBClassifier as XGBC
from lightgbm import LGBMClassifier as LGBMC, log_evaluation, early_stopping
from catboost import CatBoostClassifier as CBC
from sklearn.metrics import *

from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
# Run-mode switch: set to True for a quick syntax-check run.
test_req = False

if test_req:
    print("THIS IS A SYNTAX CHECK RUN")

# `nest` is small for a syntax check and large otherwise
# (presumably an estimator budget; it is not referenced in the visible cells).
nest = 200 if test_req else 7000

# **PREPROCESSING**

In [None]:

# Load the competition train/test splits and the original dataset the
# later cells use for target/count statistics.
train = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')
orig = pd.read_csv('/kaggle/input/loan-prediction-dataset-2025/loan_dataset_20000.csv')

print(f'Train Shape: {train.shape}')
print(f'Test Shape: {test.shape}')
print(f'Orig Shape: {orig.shape}')

# Label column, the columns treated as categorical, and every raw input
# column (everything in train except the row id and the label).
TARGET = 'loan_paid_back'
CATS = [
    'gender',
    'marital_status',
    'education_level',
    'employment_status',
    'loan_purpose',
    'grade_subgrade',
]
BASE = [name for name in train.columns if name != 'id' and name != TARGET]

# **FEATURE ENGINEERING**

In [None]:

# --- Interaction features ---------------------------------------------------
# Each interaction is the '_'-joined string form of its parent columns and is
# added in place to train, test and orig.
INTER = []

# Materialise the pairs so tqdm knows the total and can show real progress.
for col1, col2 in tqdm(list(combinations(BASE, 2))):
    new_col_name = f'{col1}_{col2}'
    INTER.append(new_col_name)
    for df in [train, test, orig]:
        df[new_col_name] = df[col1].astype(str) + '_' + df[col2].astype(str)

print(f'{len(INTER)} Features')

for col1, col2, col3 in combinations(CATS, 3):
    new_col_name = f'{col1}_{col2}_{col3}'
    INTER.append(new_col_name)
    for df in [train, test, orig]:
        # Put a separator between *every* part (matching the column name and
        # the pairwise features) so distinct category combinations cannot
        # collapse into the same string.
        df[new_col_name] = (
            df[col1].astype(str)
            + '_' + df[col2].astype(str)
            + '_' + df[col3].astype(str)
        )

print(f'{len(INTER)} Features')

# --- Statistics from the original dataset -----------------------------------
# For every base column, attach the mean label and the row count that each
# value has in `orig`. Values absent from `orig` become NaN.
ORIG = []

for col in BASE:
    groups = orig.groupby(col)

    # MEAN
    # Series.map is used instead of a left merge: the lookup result is the
    # same (group keys are unique), but assigning by name keeps this cell
    # idempotent -- re-running it overwrites the columns instead of creating
    # `_x` / `_y` duplicates that would break `train[FEATURES]`.
    new_mean_col_name = f"orig_mean_{col}"
    mean_map = groups[TARGET].mean()
    train[new_mean_col_name] = train[col].map(mean_map)
    test[new_mean_col_name] = test[col].map(mean_map)
    ORIG.append(new_mean_col_name)

    # COUNT
    new_count_col_name = f"orig_count_{col}"
    count_map = groups.size()
    train[new_count_col_name] = train[col].map(count_map)
    test[new_count_col_name] = test[col].map(count_map)
    ORIG.append(new_count_col_name)

print(len(ORIG), 'Orig Features Created!!')

FEATURES = BASE + ORIG + INTER
print(len(FEATURES), 'Features.')

# Model inputs and label, selected from the engineered training frame.
X = train[FEATURES]
y = train[TARGET]

# **MODEL TRAINING**

In [None]:
# Display the training frame after feature engineering as a sanity check.
train

In [None]:
# TabularPredictor is exported by the `autogluon.tabular` subpackage; the
# top-level `autogluon` namespace package does not expose it.
from autogluon.tabular import TabularPredictor

In [None]:
import os


def is_interactive():
    """Return True when KAGGLE_KERNEL_RUN_TYPE is set to "Interactive"."""
    run_type = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
    return run_type == "Interactive"


print("is interactive session?", is_interactive())

# Interactive sessions get a lighter preset and a short time budget (seconds);
# any other run type gets the heavier preset and a one-hour budget.
if is_interactive():
    preset_quality = "medium_quality"
    time_limit = 60
else:
    preset_quality = "best_quality"
    time_limit = 3600

In [None]:
# NOTE(review): the label is fit as a regression target scored by RMSE. If
# `loan_paid_back` is a binary flag, confirm this is intentional (vs.
# problem_type='binary' + predict_proba) before changing it, because the
# submission cell writes predictor.predict() output directly.
predictor = TabularPredictor(label = TARGET,
                             problem_type = 'regression',
                             eval_metric = 'rmse')

# TRAIN AUTOGLUON
# * Fit on the engineered features plus the label only, so the raw `id`
#   column is not handed to the models as a predictor.
# * Use the preset chosen for this run type (`preset_quality`) rather than a
#   hard-coded 'best_quality', so short interactive runs are not asked to do
#   a best-quality fit inside their small time budget.
predictor.fit(train[FEATURES + [TARGET]],
              presets = preset_quality,
              time_limit = time_limit,
              auto_stack = True,
              #num_bag_folds = 7,
              #num_bag_sets = 3,
              num_cpus = 4,
              verbosity = 1,
              #ag_args_fit={'num_gpus': 1}
             )

In [None]:
# Show the leaderboard of the models the predictor has trained.
predictor.leaderboard()


In [None]:
# Compute feature importance on the first 100 rows of the engineered training
# frame. The frame must contain the label column; `train` does. (The previous
# `merge_train` name is not defined anywhere in this notebook.)
importance_df = predictor.feature_importance(train.head(100))

# Shade the importance and stddev columns so large values stand out.
importance_df.style.background_gradient(subset=['importance', 'stddev'], cmap='Blues')

In [None]:
# Horizontal bar chart of importances. Sorting ascending puts the most
# important feature at the top of a barh plot.
imp = importance_df['importance'].sort_values(ascending=True)

# `plt` is never imported in this notebook, so drive the figure through the
# Axes object that pandas returns instead of the pyplot state machine.
ax = imp.plot(kind='barh', color='steelblue', figsize=(6, 8))
ax.set_title('Feature Importance (AutoGluon)')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Feature');  # trailing ';' hides the Text repr; the figure still renders inline

In [None]:

# TEST DATA PREDICTION
# Score the engineered test frame (the previous `merge_test` name is not
# defined anywhere in this notebook).
y_test = predictor.predict(test)

submission = pd.read_csv(
        "/kaggle/input/playground-series-s5e11/sample_submission.csv")

# Assign by position rather than by index label, after checking the row
# counts agree, so a non-default index on either side cannot silently
# introduce NaNs. Assumes sample_submission rows are in test.csv order.
assert len(y_test) == len(submission), "prediction/submission row count mismatch"
submission["loan_paid_back"] = y_test.to_numpy()

# index=False keeps the file to the sample-submission columns only; without
# it pandas writes the row index as an extra unnamed first column.
submission.to_csv("submission.csv", index=False)
submission