<a href="https://colab.research.google.com/github/gRedDeer/kaggle_notebooks/blob/main/PS4E10_submission_real.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/Kaggle
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/MyDrive/Kaggle"

Mounted at /content/drive
/content/drive/MyDrive/Kaggle


In [2]:
# Repeat of the Drive mount performed in the first cell; when Drive is already
# mounted this only prints the "already mounted" notice.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install catboost
!pip install xgboost
!pip install lightgbm

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.base import clone
from sklearn.pipeline import Pipeline
import gc
import xgboost as xgb
from xgboost import XGBClassifier,XGBRFClassifier, DMatrix, plot_importance as xgb_plot_importance
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier, plot_importance
from lightgbm.callback import early_stopping
import warnings

warnings.filterwarnings('ignore')

from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import Ridge


In [None]:
# Load the competition train/test splits (indexed by `id`) and the original
# credit-risk dataset that is later appended to the training rows.
DATA_DIR = '/content/drive/MyDrive/Kaggle/Loan Approval Prediction'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv', index_col='id')
# Fixed: this used to re-read train.csv, so every "test" prediction was in fact
# made on the training rows.
test_df = pd.read_csv(f'{DATA_DIR}/test.csv', index_col='id')
orig_df = pd.read_csv(f'{DATA_DIR}/credit_risk_dataset.csv')

In [None]:
# Stack the original dataset under the competition rows and renumber the
# combined frame 0..n-1 (the previous index values are discarded).
train_df = pd.concat([train_df, orig_df], ignore_index=True)
train_df.head()

In [None]:
# Column dtypes and non-null counts of the combined training frame.
train_df.info()

In [None]:
# Number of missing values in each column.
train_df.isna().sum()

# **EDA & Feature Engineering**

&nbsp;

In [None]:
# Replace missing values in these two training columns with the column mean.
for nan_col in ('loan_int_rate', 'person_emp_length'):
    col_mean = train_df[nan_col].mean()
    train_df[nan_col] = train_df[nan_col].fillna(col_mean)

In [None]:
# Add two engineered columns, in place, to both the train and the test frame.
for df in (train_df, test_df):
    rate_factor = 1 + df['loan_int_rate'].astype(float)
    df['partial_net_worth'] = df['person_income'] * df['person_emp_length']
    df['monthly_debt'] = df['loan_amnt'] * rate_factor / 12

In [None]:
# Keep the label as its own Series; every other column name is a feature.
target = train_df['loan_status']
features = [name for name in train_df.columns if name != 'loan_status']

In [None]:
# Split the feature list by dtype: object columns are categorical, everything
# else in `features` is numeric.
cat_features = train_df.select_dtypes(include='object').columns.tolist()
# Filter `features` rather than taking a set difference: set iteration order is
# arbitrary and can change between kernel restarts, which reshuffled the
# histogram panels. This keeps the original column order.
num_features = [col for col in features if col not in cat_features]

In [None]:
# Summary statistics with one row per column, ordered by the `unique` count.
train_df.describe(include='all').T.sort_values(by='unique')

In [None]:
# For each categorical feature, show min/mean/max of the loan amount and the
# interest rate per category level.
stat_cols = ['loan_amnt', 'loan_int_rate']
for col in cat_features:
    per_level = train_df.groupby(col)[stat_cols]
    display(per_level.agg(['min', 'mean', 'max']))

In [None]:
# Histogram of every numeric feature on a 4x3 grid, split by the target.
plt.figure(figsize=(16, 12))
for position, col in enumerate(num_features, start=1):
    ax = plt.subplot(4, 3, position)
    sns.histplot(data=train_df, x=col, hue=target, ax=ax)

plt.tight_layout()
plt.show()

In [None]:
# Count plot of every categorical feature on a 2x2 grid, split by the target.
plt.figure(figsize=(16, 12))
for position, col in enumerate(cat_features, start=1):
    ax = plt.subplot(2, 2, position)
    sns.countplot(data=train_df, x=col, hue=target, ax=ax)

plt.tight_layout()
plt.show()

In [None]:
# Class balance of the target.
sns.countplot(data=train_df, hue=target, x=target)
plt.show()

In [None]:
# Correlation heatmap over all columns. Categorical columns are replaced by
# their integer codes on a copy, so train_df itself is left untouched.
train_new = train_df.copy()
for col in cat_features:
    codes, _uniques = train_new[col].factorize()
    train_new[col] = codes

cor_mat = train_new.corr()
# Non-zero entries of the upper triangle (diagonal included) act as the mask.
mask = np.triu(cor_mat)

plt.figure(figsize=(10, 8))
sns.heatmap(cor_mat, mask=mask, annot=True, fmt='.2f', cmap='coolwarm')
plt.tight_layout()
plt.show()

In [None]:
# Cross-validation splitter: 10 stratified folds, shuffled with a fixed seed.
skfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
def convert_to_string(df, columns=None):
    """Return a copy of ``df`` with missing values filled and columns cast to string.

    Missing values in the whole frame are first replaced by 0, then each of
    the selected columns is cast to pandas' ``'string'`` dtype. The input
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert.
    columns : iterable of str, optional
        Columns to cast. When omitted, the notebook-level ``features`` list is
        used, which keeps existing ``convert_to_string(df)`` calls working.

    Returns
    -------
    pandas.DataFrame
        New frame; columns not listed keep their dtype (after the fill).

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    """
    if columns is None:
        # Backward-compatible default: fall back to the global feature list.
        columns = features
    # fillna() already returns a new frame, so no separate .copy() is needed.
    filled = df.fillna(0)
    return filled.astype({col: 'string' for col in columns})

In [None]:
# Feature matrix and label vector used for cross-validation.
X = train_df.drop('loan_status', axis=1)
# Plain NumPy array so fold rows can be selected positionally with y[idx].
# `Series.ravel()` is deprecated in pandas >= 2.2; `to_numpy()` is the
# supported equivalent.
y = target.to_numpy()

In [None]:
# Result stores keyed by model name: out-of-fold train predictions,
# fold-averaged test predictions, and the mean cross-validation AUC.
train_preds, test_preds, auc_mean_scores = {}, {}, {}

In [None]:
# CatBoost predictor, cross-validated with `skfold`. Produces out-of-fold train
# predictions, fold-averaged test predictions and per-fold feature importances.
# Trains on GPU when one is available. Author's note: CatBoost is still not
# fully optimised for GPUs, so AUCs will still run on CPU.

import torch

# Per-fold test predictions, per-fold validation AUCs, and the out-of-fold
# prediction for every training row.
oof_preds = []
oof_aucs = []
oof_train_preds = np.zeros(len(y))

feature_importance_df = pd.DataFrame()  # DataFrame to store feature importances

cat_params = {
    'task_type': "GPU" if torch.cuda.is_available() else "CPU",
    #'devices': 0,
    'loss_function': 'Logloss',
    'eval_metric': "AUC",
    'bagging_temperature': 0.25,
    'iterations': 10000,
    'learning_rate': 0.045,
    'max_depth': 7,
    'l2_leaf_reg': 0.80,
    'min_data_in_leaf': 30,
    'random_strength': 0.25,
    'random_state': 42,
    'early_stopping_rounds': 200,
    'use_best_model': True,
    'allow_writing_files': False,
}

# Every feature is cast to string and passed to CatBoost as categorical.
X_cat = convert_to_string(X)
test_cat = convert_to_string(test_df)

test_pool = Pool(test_cat, cat_features=features)

# Model from the fold with the highest validation AUC, and that AUC.
best_cat_clf = None
best_auc = float('-inf')

for fold, (train_idx, test_idx) in enumerate(skfold.split(X_cat, y)):
    X_train, y_train = X_cat.iloc[train_idx], y[train_idx]
    X_test, y_test = X_cat.iloc[test_idx], y[test_idx]

    X_train_pool = Pool(X_train, y_train, cat_features=features)
    X_test_pool = Pool(X_test, y_test, cat_features=features)

    cat_clf = CatBoostClassifier(**cat_params)
    cat_clf = cat_clf.fit(X=X_train_pool, eval_set=X_test_pool, verbose=0, early_stopping_rounds=200)


    # Out-of-fold predictions
    oof_train_preds[test_idx] = cat_clf.predict_proba(Pool(X_test, cat_features=features))[:, 1]
    test_pred = cat_clf.predict_proba(test_pool)[:, 1]

    # Save feature importance for this fold
    fold_importance = cat_clf.get_feature_importance(Pool(X_test, cat_features=features))

    # Store fold importances in a DataFrame
    fold_importance_df = pd.DataFrame({
        'feature': features,
        f'importance_fold_{fold + 1}': fold_importance
    })

    # If it's the first fold, keep both 'feature' and importance
    if fold == 0:
        feature_importance_df = fold_importance_df
    else:
        # Concatenate only the importance columns for subsequent folds
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df[f'importance_fold_{fold + 1}']], axis=1)

    oof_preds.append(test_pred)
    auc = cat_clf.best_score_['validation']['AUC']
    oof_aucs.append(auc)
    print(f"\nFold {fold+1} --> ROC-AUC Score: {auc:.6f}\n")

    # Keep the model from the best-scoring fold. The previous check compared
    # `auc` with max(oof_aucs) after `auc` had already been appended, so it
    # could never be true and the first fold's model was always the one kept.
    if best_cat_clf is None or auc > best_auc:
        best_cat_clf = cat_clf
        best_auc = auc

    # Drop this fold's names; best_cat_clf keeps its own reference to the model.
    del X_train, y_train, X_test, y_test
    del X_train_pool, X_test_pool
    del cat_clf
    gc.collect()

# Calculate the mean feature importance across folds
importance_cols = [col for col in feature_importance_df.columns if 'importance_fold' in col]
feature_importance_df['mean_importance'] = feature_importance_df[importance_cols].mean(axis=1)

# Select the top 6 features by mean importance
top_features = feature_importance_df.nlargest(6, 'mean_importance')

# Plot the top 6 important features
plt.figure(figsize=(5, 3))
sns.barplot(x='mean_importance', y='feature', data=top_features)
plt.title('Top 6 Feature Importance (CatBoost)')
plt.xlabel('Mean Importance')
plt.ylabel('Feature')
plt.show()

# Average and standard deviation of AUCs
auc_mean = np.mean(oof_aucs)
auc_mean_scores['cat'] = auc_mean
auc_std = np.std(oof_aucs)
print(f"\nAverage Fold ROC-AUC Score: {auc_mean:.6f} ± {auc_std:.6f}\n")

# Publish results under the 'cat' key: OOF train predictions and the mean of
# the per-fold test predictions.
train_preds['cat'] = oof_train_preds
test_pred_cat = np.mean(oof_preds, axis=0)
test_preds['cat'] = test_pred_cat

In [None]:
# Turn the per-model prediction dicts into DataFrames (one column per model).
train_preds_df = pd.DataFrame(data=train_preds)
test_preds_df = pd.DataFrame(data=test_preds)

In [None]:
# Build the submission file from the CatBoost test predictions.
sub = pd.read_csv('/content/drive/MyDrive/Kaggle/Loan Approval Prediction/sample_submission.csv')
# Assign by position, not by index label: a row-count mismatch between the
# predictions and the sample submission now raises instead of being silently
# truncated or padded with NaN by index alignment.
sub['loan_status'] = test_preds_df['cat'].to_numpy()
# Tag the file name with the mean CV AUC scaled by 1e6 and truncated to an int.
# int() works for plain Python floats as well as NumPy scalars.
auc_tag = int(auc_mean_scores['cat'] * 1000000)
csv_str = f'/content/drive/MyDrive/Kaggle/Loan Approval Prediction/submission{auc_tag}.csv'
sub.to_csv(csv_str, index=False)

In [None]:
# Show the path the submission CSV was written to.
csv_str

In [None]:
!kaggle competitions submit -c playground-series-s4e10 -f '/content/drive/MyDrive/Kaggle/Loan Approval Prediction/submission967769.csv' -m "colab_2"